Here, I analyse and document my progress with the analysis of a world-wide whole-genome sampling of Zymoseptoria tritici. Some of the analyses are purely exploratory, while others are lined up in a clear path towards answering questions related to the history of Z. tritici and to better understanding its adaptation to various climates.
library(knitr)
library(reticulate)
#Spatial analyses packages
library(raster)
library(sp)
library(rgdal)
library(maps)
#Data wrangling and data viz
library(tidyverse)
library(RColorBrewer)
library(plotly)
library(cowplot)
library(GGally)
library(corrplot)
library(pheatmap)
library(ggstance)
library('pophelper')
library(ggbiplot)
library(igraph)
library(ggraph)
#Pop structure and pop genomic
library(SNPRelate) #PCA
library(LEA) #Clustering
library(pophelper)
library(PopGenome) #Summary statistics
library(gridExtra)
library(ggtree)
library(tidytree)
#GEA
library(lfmm)
#Statistics
library(car)
library(corrr)
library(lsmeans)
library(multcomp)
# Variables --------------------------------------------------------------
# Base world polygons used by every sampling map below.
world <- map_data("world")
project_dir <- "/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/"
# Data directories
data_dir <- paste0(project_dir, "0_Data/")
metadata_dir <- paste0(project_dir, "Metadata/")
# Analysis directories
# --------------------
VAR_dir <- paste0(project_dir, "1_Variant_calling/")
vcf_dir <- paste0(VAR_dir, "4_Joint_calling/")
PopStr_dir <- paste0(project_dir, "2_Population_structure/")
Sumstats_dir <- paste0(project_dir, "3_Sumstats_demography/")
TE_RIP_dir <- paste0(project_dir, "4_TE_RIP/")
RIP_DIR <- paste0(TE_RIP_dir, "0_RIP_estimation/")
DIM2_DIR <- paste0(TE_RIP_dir, "1_Blast_from_denovo_assemblies/")
GEA_dir <- paste0(project_dir, "5_GEA/")
fung_dir <- paste0(project_dir, "6_Fungicide_resistance/")
virulence_dir <- paste0(project_dir, "7_Virulence/")
sel_dir <- paste0(project_dir, "8_Selection/")
gene_list_dir <- paste0(sel_dir, "0_Lists_unique_copy/")
# Files
# Thinned biallelic SNP set (with and without the MAF 0.05 filter); only the
# basename is stored here, directories are prepended where needed.
#vcf_name=paste0(project_dir, "Ztritici_global_March2020.filtered-clean.SNP.max-m-0.8.maf-0.05.thin-1000bp")
vcf_name <- "Ztritici_global_March2020.filtered-clean.SNP.max-m-0.8.maf-0.05.thin-1000bp"
#vcf_name_nomaf=paste0(project_dir, "Ztritici_global_March2020.filtered-clean.SNP.max-m-0.8.thin-1000bp")
vcf_name_nomaf <- "Ztritici_global_March2020.filtered-clean.SNP.max-m-0.8.thin-1000bp"
Zt_list <- paste0(metadata_dir, "Ztritici_list")
gff_file <- paste0(data_dir, "Zymoseptoria_tritici.MG2.Grandaubert2015.gff3")
ref_fasta_file <- paste0(data_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa")
gene_annotation <- read_tsv(paste0(data_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
# Export paths as environment variables so the bash chunks below can reuse them.
Sys.setenv(PROJECTDIR = project_dir)
Sys.setenv(VARDIR = VAR_dir)
Sys.setenv(VCFDIR = vcf_dir)
Sys.setenv(POPSTR = PopStr_dir)
Sys.setenv(SUMST = Sumstats_dir)
Sys.setenv(GEADIR = GEA_dir)
Sys.setenv(ZTLIST = Zt_list)
Sys.setenv(GFFFILE = gff_file)
Sys.setenv(REFFILE = ref_fasta_file)
Sys.setenv(VCFNAME = vcf_name)
Sys.setenv(VCFNAME_NOMAF = vcf_name_nomaf)
# Chunk options. FIX: spell out TRUE/FALSE (T/F are reassignable bindings).
#knitr::opts_chunk$set(echo = FALSE)
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)
# NOTE(review): knitr documents `results` as a character option ("markup",
# "asis", "hide", "hold"); a logical value is not a documented setting —
# kept as-is but worth confirming.
knitr::opts_chunk$set(results = TRUE)
#py_install(packages = 'biopython')
# Prettier correlation colors
mycolorsCorrel <- colorRampPalette(c("#0f8b8d", "white", "#a8201a"))(20)
greens <- c("#1B512D", "#669046", "#8CB053", "#B1CF5F", "#514F59")
blues <- c("#08386E", "#1C89C9", "#28A7C0", "#B0DFE8", "grey")
# 1 - Variant calling
# Pull the variant-calling results from the analysis server.
rsync -avP \
alice@130.125.25.187:/data2/alice/WW_project/1_Variant_calling/1* \
/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/
# FIX: this transfer previously dropped 1_Variant_calling/2* into the project
# root, inconsistent with the transfer above; place it under 1_Variant_calling/.
rsync -avP \
alice@130.125.25.187:/data2/alice/WW_project/1_Variant_calling/2* \
/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/1_Variant_calling/
# 4 - TE and RIP
rsync -avP \
alice@130.125.25.187:/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/*txt \
/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/0_RIP_estimation/
#Composite_index.txt
#GC_percent.txt
#Nb_reads.txt
#Nb_reads_per_TE.txt
# NOTE(review): this one flattens 1_Blast_dim2_deRIPped/ into the parent
# directory — confirm that is intentional.
rsync -avP \
alice@130.125.25.187:/data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/*txt \
/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/
# Per-individual mean depth over the thinned SNP set; writes ${VCFNAME}.idepth.
vcftools --vcf ${VCFDIR}${VCFNAME}.recode.vcf \
--depth --out ${VCFDIR}${VCFNAME}
# Per-sample mean depth produced by the vcftools --depth run above.
depth = read_delim(paste0(vcf_dir, vcf_name, ".idepth"), delim = "\t")
# Main sample metadata sheet; Coordinates = "Latitude;Longitude"
# (original columns kept, remove = F).
Zt_meta=readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_onlyWW.xlsx"),
sheet = 1, n_max = 1000) %>%
filter(Species == "Ztritici") %>%
unite(Coordinates, Latitude, Longitude, sep = ";", remove = F)
# Isolates planned for future sequencing (no genomes yet).
Zt_meta_future=readxl::read_excel(paste0(metadata_dir, "New_sequencing.xlsx"),
sheet = 1, n_max = 1000)
# Drop flagged duplicates/clones and samples with no country of origin.
Zt_meta = Zt_meta %>%
filter(is.na(Duplicate_Clone_LowQual_Flags) | Duplicate_Clone_LowQual_Flags != "DUPLICATE_remove") %>%
filter(!is.na(Country))
# Write the list of retained isolates (mean depth >= 5x) used as the
# vcftools --keep file ($ZTLIST.txt) in the chunks below.
# NOTE(review): write_delim() emits a header line ("ID_file"); vcftools
# treats it as a non-matching sample ID, which appears harmless — confirm.
Zt_meta %>%
filter(is.na(Duplicate_Clone_LowQual_Flags) | Duplicate_Clone_LowQual_Flags != "DUPLICATE_remove") %>%
inner_join(., depth %>% filter(MEAN_DEPTH >= 5), by = c("ID_file" = "INDV")) %>%
dplyr::select(ID_file) %>% write_delim(paste0(Zt_list, ".txt"))
# Isolates without an assigned collection fall into a catch-all category.
Zt_meta <- Zt_meta %>%
  mutate(Collection = ifelse(is.na(Collection), "Other", Collection))
# Define stable colors
#myColors <- brewer.pal(7, "Dark2")
# One fixed colour per continent/region so every figure uses the same palette.
myColors <- setNames(
  c("#04078B", "#a10208", "#FFBA08", "#CC0044", "#5C9D06", "#129EBA", "#305D1B"),
  levels(factor(Zt_meta$Continent_Region)))
Color_Continent <- ggplot2::scale_colour_manual(name = "Continent_Region", values = myColors)
Fill_Continent <- ggplot2::scale_fill_manual(name = "Continent_Region", values = myColors)
The sampling of Z. tritici isolated from the natural environment covers almost the entirety of the wheat-growing continents. It is, however, highly heterogeneous. Europe has the highest sampling density. Several locations are heavily sampled, such as fields in Switzerland or the US. Some of the available genomes are still under embargo (waiting for the publication of their creator). We are also thinking about sequencing more genomes. Here you can view these different datasets on a map. Please select and unselect the different values to have a view of the changes with added datasets.
# Dataset behind the interactive sampling map: planned sequencing targets
# stacked on top of the current genomes, labelled by availability.
future_set <- Zt_meta_future %>%
  mutate(Sampling_collection = "Future sequencing")
current_set <- Zt_meta %>%
  filter(!is.na(Latitude)) %>%
  mutate(Sampling_collection = ifelse((Duplicate_Clone_LowQual_Flags != "Publication_embargo" |
                                         is.na(Duplicate_Clone_LowQual_Flags)),
                                      "Usable_genomes",
                                      "Embargoed genomes"))
Zt_meta_for_shiny = bind_rows(future_set, current_set)
# Tally of genomes per availability class.
kable(Zt_meta_for_shiny %>% dplyr::count(Sampling_collection, name = "Number of genomes"))
| Sampling_collection | Number of genomes |
|---|---|
| Embargoed genomes | 148 |
| Future sequencing | 224 |
| Usable_genomes | 727 |
# Largest per-site genome count; keeps circle sizes comparable across maps.
max_circle = max(Zt_meta_for_shiny %>%
dplyr::count(Latitude, Longitude, name = "Number_genomes") %>%
dplyr::select(Number_genomes))
# Each entry: a display label followed by the Sampling_collection values it pools.
combinations=list(
c("Usable genomes", "Usable_genomes"), c("Usable genomes and JGI", "Usable_genomes", "Embargoed genomes"),
c("Present and future", "Usable_genomes", "Embargoed genomes", "Future sequencing"))
temp_list = list()
for (i in c(1:3)) {
# Per-site genome counts for the i-th pooled dataset, tagged with its label.
y = Zt_meta_for_shiny %>%
filter(Sampling_collection %in% combinations[[i]]) %>%
dplyr::count(Country, Latitude, Longitude, name = "Number_genomes") %>%
filter(Number_genomes > 0) %>%
mutate(Which_isolates = combinations[[i]][[1]])
temp_list[[i]] = y
}
# NOTE(review): this combined table is immediately overwritten by the next
# assignment and never used — dead store; confirm before removing.
temp = do.call(rbind, temp_list)
# Per-site counts for the currently available genomes (usable + embargoed).
temp = Zt_meta_for_shiny %>%
filter(Sampling_collection == "Usable_genomes" | Sampling_collection == "Embargoed genomes") %>%
dplyr::count(Country, Latitude, Longitude, name = "Number_genomes") %>%
filter(Number_genomes > 0)
# World map of the current sampling; circle size scales with the number of
# genomes per site, capped to max_circle for comparability.
isolate_map = ggplot() + theme_void() +
geom_polygon(data = world, aes(x=long, y = lat, group = group), fill="#ede7e3", alpha=0.7) +
geom_point (data = temp, aes(x=as.numeric(Longitude), y=as.numeric(Latitude),
size=Number_genomes,
text = Country),
alpha = 0.6, color = "#16697a") +
scale_size("Number of genomes", limits = c(1, max_circle))
#ggplotly(isolate_map)
#isolate_map
# Sites scheduled for future sequencing, overlaid in yellow on the same map.
temp2 = Zt_meta_for_shiny %>%
filter(Sampling_collection == "Future sequencing") %>%
dplyr::count(Country, Latitude, Longitude, name = "Number_genomes") %>%
filter(Number_genomes > 0)
isolate_map +
geom_point (data = temp2, aes(x=as.numeric(Longitude), y=as.numeric(Latitude),
size=Number_genomes,
text = Country),
alpha = 0.6, color = "#FFBA08") +
scale_size("Number of genomes", limits = c(1, max_circle))
The sampling also covers a wide range of years: starting from 1990 to 2017. Just as with the geographical repartition, some years are heavily sampled, reflecting sampling in specific fields done for previous experiments.
# Full range of sampling years, so years with zero genomes still appear
# on the x-axis after the full_join below.
temp = as_tibble(c(min(Zt_meta_for_shiny$Date_Year, na.rm = T) : max(Zt_meta_for_shiny$Date_Year, na.rm = T))) %>%
mutate(`Sampling year` = as.character(value))
# Genomes per year and continent; missing combinations get a count of 0.
sum_temp = Zt_meta_for_shiny %>%
mutate(`Sampling year` = as.character(Date_Year)) %>%
dplyr::count(`Sampling year`, Continent_Region) %>%
full_join(., temp) %>%
mutate(`Genome number` = replace_na(n, 0))
# Stacked bar chart of genomes per sampling year, coloured by continent.
sum_temp %>%
ggplot(aes(x=`Sampling year`, y =`Genome number`, fill = Continent_Region)) +
geom_bar(stat = "identity") +
theme_bw() + theme(axis.title = element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1)) +
Fill_Continent
Question: How prevalent is aneuploidy in natural populations of Z. tritici? In the case of accessory chromosomes, is there a correlation between phylogeny, environment, host or time and the presence/absence of some chromosomes?
Methods: Based on the depth of coverage for all samples, we can identify, for both core and accessory chromosomes, whether each isolate carries 0, 1 or several copies. Caveat: I cheated and used the depth from the vcf file with the SNP subset to get a preliminary analysis. The proper analysis should be done on the bam files.
#NB: this table is currently not used. See TODO in next chunk.
# Count retained SNPs per chromosome while producing per-chromosome .idepth
# files (one vcftools --depth run per chromosome, 1..21).
nb_sites_table=${VCFDIR}Nb_sites.txt
echo "CHROM NB_SITES" > ${nb_sites_table}
for i in {1..21};
do
vcftools --vcf ${VCFDIR}${VCFNAME}.recode.vcf \
--out ${VCFDIR}${VCFNAME}.chr_${i} \
--chr ${i} \
--depth &> \
${VCFDIR}${VCFNAME}.chr_${i}.log ;
# FIX: removed the stray leading "/" before ${VCFDIR} in the redirect above,
# which produced a double-slash path.
# Parse "After filtering, kept N out of a possible M Sites" from the log.
nb_sites=$(grep "After filtering" ${VCFDIR}${VCFNAME}.chr_${i}.log \
| grep "Sites" | cut -f 4 -d " ")
echo chr_${i} ${nb_sites} >> $nb_sites_table
done
# The logs were only needed for the site counts.
rm ${VCFDIR}${VCFNAME}.chr_*.log
# Stack the 21 per-chromosome .idepth files into one long table
# (one row per sample x chromosome).
depth_prefix = paste0(vcf_dir, vcf_name, ".chr_")
depth_suffix = ".idepth"
depth_results = list()
# FIX: dropped the redundant counter `d` (it always equalled `i`) and the
# unused `coverage_results` list; seq_len() instead of c(1:21).
for (i in seq_len(21)) {
  # "NaN" marks samples with no genotyped site on that chromosome.
  temp = read.table(paste0(depth_prefix, i, depth_suffix), header = TRUE, na.strings = "NaN")
  temp$CHROM = i
  temp$NORM_N_SITES = temp$N_SITES/max(temp$N_SITES) #TODO shift to using table created above
  depth_results[[i]] = temp
}
depth = bind_rows(depth_results)
# No genotyped sites means no coverage there.
depth$MEAN_DEPTH[is.na(depth$MEAN_DEPTH)] = 0
# Per-sample reference depth: median over the CORE chromosomes (1-13).
summary_depth = depth[depth$CHROM < 14,] %>%
  dplyr::group_by(INDV) %>%
  dplyr::summarise(core_genome_depth = median(MEAN_DEPTH, na.rm = TRUE), coverage = sum(N_SITES))
# FIX: select join columns by name rather than position for robustness.
depth = full_join(depth, summary_depth[, c("INDV", "core_genome_depth")])
# Normalized depth ~ copy number (1 = single copy); capped at 2 for plotting.
depth$NORM_MEAN_DEPTH = depth$MEAN_DEPTH / depth$core_genome_depth
depth$NORM_MEAN_DEPTH_CAPPED = ifelse(depth$NORM_MEAN_DEPTH < 2, depth$NORM_MEAN_DEPTH, 2)
# Attach metadata and order samples by continent > country > year for plots.
# NOTE(review): fct_reorder() with a character .x relies on median() accepting
# it in this forcats version — confirm.
depth = inner_join(Zt_meta, depth, by = c("ID_file" = "INDV")) %>%
  mutate(Sample = fct_reorder(ID_file, Date_Year)) %>%
  mutate(Sample = fct_reorder(Sample, Country)) %>%
  mutate(Sample = fct_reorder(Sample, Continent_Region))
In the heatmap, I represent the depth normalized by the median depth over all core chromosomes. As expected, the copy-number variation at the chromosome scale affects mostly the accessory chromosomes (AC). There is some presence of supernumerary AC and a lot of presence-absence variation. Supernumerary chromosomes can also be found in the core chromosomes but this is almost anecdotal as over the whole sampling this was found only in 9 cases.
# Heatmap of per-chromosome depth normalized by each isolate's core-genome
# median depth; the dashed line separates core (1-13) from accessory (14-21).
heatmap_depth = ggplot(data = depth, aes(x = CHROM, y=Sample, fill=NORM_MEAN_DEPTH)) +
  geom_tile() + scale_fill_gradient(low="white", high = "#16697a") +
  geom_vline(xintercept = 13.5, linetype = "longdash", colour = "gray20") +
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(), axis.ticks.y=element_blank()) +
  # FIX: the x-axis label sat in a free-standing labs() call (the "+" was
  # missing), so it was printed to the document instead of applied to the plot.
  labs(fill = "Depth", x = "Chromosome") + xlim(c(0.5, 21.5))
## $x
## [1] "Chromosome"
##
## attr(,"class")
## [1] "labels"
# Continent side-bar aligned with the heatmap rows (white x text hides the
# dummy axis while keeping panel alignment).
plot_continent = ggplot(data = depth, aes(x = 1, y=Sample, fill=Continent_Region)) +
  geom_tile(aes(width = 2)) +
  theme_classic() +
  theme(axis.text.y = element_blank(), axis.ticks.y=element_blank(),
        legend.position="left",
        axis.text.x=element_text(colour="white")) +
  # FIX: removed scale_fill_brewer(), which was immediately overridden by
  # Fill_Continent (duplicate fill scale); only the fixed palette is kept.
  labs(y= "Isolate") + Fill_Continent
plot_grid(plot_continent, heatmap_depth, rel_widths = c(2, 5))
# Copy-number call from normalized depth: above Hthres = supernumerary
# ("High"), below Lthres = absent ("Low"), otherwise "Normal".
Lthres <- 0.50
Hthres <- 1.50
depth <- depth %>%
  dplyr::mutate(Depth_is = dplyr::if_else(NORM_MEAN_DEPTH > Hthres, "High",
                                          dplyr::if_else(NORM_MEAN_DEPTH < Lthres, "Low", "Normal")))
# Stacked counts of depth calls per chromosome.
bar_Ndepth_per_CHR <- ggplot(depth, aes(x = CHROM, fill = Depth_is)) +
  geom_bar(stat = "count") +
  theme_light() +
  scale_fill_manual(values = c("#16697a", "#82c0cc", "#EDE7E3")) +
  labs(x= "Chromosome", y = "Number of isolates")
#lollipop plots
##For high normalized depth values
# FIX: the original stored these counts in the shared variable `temp` and
# referenced max(temp$n) inside aes(); aesthetics are evaluated when the plot
# is RENDERED, by which point `temp` had been reassigned for the "Low" case,
# so the grey background segments used the wrong maximum. A dedicated data
# frame and a precomputed constant avoid the lazy-evaluation hazard.
high_counts = depth %>%
  filter(Depth_is == "High") %>%
  dplyr::group_by(CHROM) %>%
  dplyr::count()
max_high = max(high_counts$n)
lolhigh = ggplot(high_counts, aes(x = as.character(CHROM), y = n)) +
  # Grey background bar spanning the full y-range, for visual reference.
  geom_segment( aes(x=as.character(CHROM), xend=as.character(CHROM), y=0, yend=max_high),
                color="grey80", size = 1) +
  geom_segment( aes(x=as.character(CHROM), xend=as.character(CHROM), y=0, yend=n),
                color="grey20", size = 1) +
  geom_point( color="#16697a", size=4) +
  geom_text(aes( label = n,
                 y= n), stat= "identity",
            hjust = -0.5, vjust = -0.2) +
  theme_light() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10)
  ) +
  # Headroom so the count labels are not clipped.
  ylim(c(0, max_high + 2 + max_high*0.1)) +
  labs(x= "Chromosome", y = "Number of isolates with supernumerary chromosome") +
  coord_flip()
##For low normalized depth values
# Same lollipop plot for chromosome absence ("Low" depth calls).
# FIX: use a dedicated data frame and a precomputed maximum rather than the
# shared `temp` variable, so the lazily evaluated aes() cannot pick up a
# later reassignment of `temp` at rendering time.
low_counts = depth %>%
  filter(Depth_is == "Low") %>%
  dplyr::group_by(CHROM) %>%
  dplyr::count()
max_low = max(low_counts$n)
lollow = ggplot(low_counts, aes(x = as.character(CHROM), y = n)) +
  geom_segment( aes(x=as.character(CHROM), xend=as.character(CHROM), y=0, yend=max_low),
                color="grey80", size = 1) +
  geom_segment( aes(x=as.character(CHROM), xend=as.character(CHROM), y=0, yend=n),
                color="grey20", size = 1) +
  geom_point( color="#82c0cc", size=4) +
  geom_text(aes( label = n,
                 y= n), stat= "identity",
            hjust = -0.5, vjust = -0.2) +
  theme_light() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10)
  ) +
  ylim(c(0, max_low + max_low*0.1)) +
  labs( x= "Chromosome", y = "Number of isolates without chromosome") +
  coord_flip()
# Panel A (stacked bars) above panels B/C (lollipops).
bottom_row <- cowplot::plot_grid(lolhigh, lollow, labels = c('B', 'C'), label_size = 12)
plot_grid(bar_Ndepth_per_CHR, bottom_row, labels = c('A', ''), label_size = 12, ncol = 1)
Here is a table including the isolates with supernumerary core chromosomes.
# Isolates carrying supernumerary copies of a CORE chromosome.
# FIX: core chromosomes are 1-13 (cf. "CHROM < 14" when computing
# core_genome_depth and the vline at 13.5); "CHROM < 13" wrongly excluded
# chromosome 13 from this table.
depth %>%
  dplyr::filter(Depth_is == "High") %>%
  dplyr::filter(CHROM <= 13)
## # A tibble: 17 x 40
## ID_file Isolate_name1 Isolate_name2 Duplicate_Clone… Species Continent_Region
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 STnnJG… Gp0153947 IPO-92044 Publication_emb… Ztriti… Africa
## 2 STnnJG… Gp0153947 IPO-92044 Publication_emb… Ztriti… Africa
## 3 STnnJG… Gp0153950 IPO-98114 Publication_emb… Ztriti… Europe
## 4 ST_SRR… IPO323 IPO323 Publication_emb… Ztriti… Europe
## 5 ST_SRR… IPO323 IPO323 Publication_emb… Ztriti… Europe
## 6 ST_SRR… IPO323 IPO323 Publication_emb… Ztriti… Europe
## 7 ST_SRR… IPO323 IPO323 Publication_emb… Ztriti… Europe
## 8 STnnJG… Gp0153927 IPO-10013 Publication_emb… Ztriti… Europe
## 9 STnnJG… Gp0153928 IPO-10014 Publication_emb… Ztriti… Europe
## 10 STnnJG… Gp0128316 IPO-95001 Publication_emb… Ztriti… Europe
## 11 STnnJG… Gp0128313 IPO-94243 Publication_emb… Ztriti… North America
## 12 ST16DK… DK15 INRA16-FS0671 <NA> Ztriti… Europe
## 13 ST16FR… FR1 INRA16-FS0794 <NA> Ztriti… Europe
## 14 ST16FR… FR1 INRA16-FS0794 <NA> Ztriti… Europe
## 15 ST13SP… INRA13-LG1979 SeptoDur104 <NA> Ztriti… Europe
## 16 ST90OR… a12_3B_11 <NA> <NA> Ztriti… North America
## 17 ST00AR… ARG BD0069 <NA> <NA> Ztriti… South America
## # … with 34 more variables: Cultivar <chr>, Bread_Durum_Wheat <chr>,
## # Collection <chr>, Plot <dbl>, Collector_Publication <chr>, Country <chr>,
## # Region_1 <chr>, Region_2 <chr>, City <chr>, Latitude <dbl>,
## # Coordinates <chr>, Longitude <dbl>, Date_Year <dbl>,
## # `Publication/comments` <lgl>, PacBio_Genome <lgl>, Library_strategy <chr>,
## # Phred <chr>, R1_1 <chr>, R1_2 <chr>, R2_1 <chr>, R2_2 <chr>, R1cat <chr>,
## # R2cat <chr>, bam <chr>, gvcf <chr>, N_SITES <int>, MEAN_DEPTH <dbl>,
## # CHROM <int>, NORM_N_SITES <dbl>, core_genome_depth <dbl>,
## # NORM_MEAN_DEPTH <dbl>, NORM_MEAN_DEPTH_CAPPED <dbl>, Sample <fct>,
## # Depth_is <chr>
And an overlook of the accessory chromosomes PAV per continent (only considering continents with more than 10 isolates).
# Accessory-chromosome (14-21) presence/absence per continent, keeping only
# continent x chromosome combinations supported by at least 10 isolates.
depth %>%
  dplyr::filter(CHROM > 13) %>%
  dplyr::add_count(Continent_Region, CHROM, name = "Count_sample_per_continent") %>%
  dplyr::filter(Count_sample_per_continent >= 10) %>%
  ggplot(aes(x = Continent_Region, fill = Depth_is)) +
  geom_bar(position = "fill" ) + facet_wrap(CHROM~.) +
  theme_light() +
  scale_fill_manual(values = c("#16697a", "#82c0cc", "#EDE7E3")) +
  theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
  ylab("Proportion of chromosomes") + xlab("")
Question: Could genes PAV be related to phylogeny, environment, host or time? What about predicted effector genes?
Methods: Based on depth of coverage + SGSGeneloss. For this section, I expanded the gene selection to all genes in the pangenome. This means remapping everything, unfortunately, but it captures more of the overall diversity and might be the only way to catch some population-specific variants.
See PAV from pangenome file (had to be separated because it was way too long to run)
# Per-gene depth windows for all samples, joined to the sample metadata.
depth_per_gene = read_tsv(paste0(VAR_dir,"2_Depth_per_gene/All_samples.tab")) %>%
left_join(.,readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
sheet = 1, n_max = 1000))
# Loci whose name contains "[" come from the PacBio assemblies; the prefix
# before the first "." (kept in column PacBio) identifies the source genome.
depth_per_PacBio = depth_per_gene %>%
filter(grepl("\\[", Locus_name)) %>%
separate(col = Locus_name,
sep = "\\.",
into= c("PacBio", "rest"),
remove = F,
fill = "left") %>%
group_by(ID_file, PacBio, Continent_Region) %>%
# Median depth per sample x source genome, then normalized by the sample's
# core-genome depth.
dplyr::summarise(Median_depth = median(as.numeric(Depth), na.rm = T),
Nb_windows = n()) %>%
inner_join(., summary_depth, by=c("ID_file" = "INDV")) %>%
mutate(Normalized_depth = Median_depth / core_genome_depth)
# NOTE(review): positional selection (columns 1, 2, 8 = ID_file, PacBio,
# Normalized_depth) is fragile if columns change upstream — confirm.
for_pheatmap = spread(depth_per_PacBio[, c(1, 2, 8)], "PacBio", "Normalized_depth")
for_pheatmap = Zt_meta %>% dplyr::select(ID_file, Continent_Region) %>%
inner_join(., for_pheatmap) %>%
filter(complete.cases(.))
# pheatmap needs a plain matrix with sample IDs as rownames.
temp = as.data.frame(for_pheatmap)
rownames(temp) = temp[,1]
temp = temp[,-1]
my_palette <- colorRampPalette(c("white", "black"))(n = 299)
# Row annotation: continent of each sample (first remaining column of temp).
annot_continent = as.data.frame(temp[, 1])
row.names(annot_continent) <- row.names(temp)
colnames(annot_continent) <- "Continent"
# Column annotation: continent of origin of each PacBio reference genome
# (order matches the rownames vector assigned just below).
color_PacBio = data.frame(Continent = c("Europe", "Europe", "Europe", "Europe", "South America",
"Oceania", "South America", "North America", "Europe",
"North America", "Middle East", "Middle East", "Middle East", "Africa",
"North America", "Africa", "Europe", "Middle East"))
rownames(color_PacBio) <- c("1A5", "1E4", "3D1", "3D7", "Arg00_1D1a1",
"Aus01_1H8", "CH95_3B2a", "CNR93_D3a1", "CRI10_0616",
"I93_11a_2", "IR01_26b", "IR01_48b", "ISY92_Ar4f", "KE94_MF1a",
"OregS90_a15_4A_10", "TN09_26_2-0", "UR95_E2C_2", "YEQ92_4")
my_colour = list(Continent = myColors)
# Clustered heatmap of normalized depth per sample (rows) and PacBio
# reference genome (columns); first column of temp is the row annotation.
pheatmap(as.matrix(temp[,c(2:ncol(temp))]),
color=my_palette,
cluster_cols = TRUE,
cluster_rows=TRUE,
border_color = NA,
show_rownames = F,
annotation_row = annot_continent,
annotation_col = color_PacBio,
annotation_colors = my_colour)
Let’s attempt to create a PCA based on the normalized depth per gene.
library(FactoMineR)
# PCA on per-gene normalized depth (1000 randomly sampled gene columns).
# NOTE(review): columns are sampled without set.seed(), so the PCA is not
# reproducible between runs — consider seeding.
depth_genes_PCA = depth_per_gene %>%
  dplyr::select(ID_file, Gene_ID, Normalized_depth) %>%
  mutate(Normalized_depth = as.numeric(Normalized_depth)) %>%
  pivot_wider(names_from = Gene_ID, values_from = Normalized_depth)
# Keep complete rows only; drop the ID column before the PCA.
temp = as.data.frame(depth_genes_PCA)[complete.cases(depth_genes_PCA),2:ncol(depth_genes_PCA)]
temp2 = temp[, sample(ncol(temp), size=1000,replace=FALSE)]
temp2 = sapply( temp2, as.numeric )
res.pca = PCA(temp2, scale.unit=TRUE, ncp=10, graph=F)
# FIX: as_tibble() — as.tibble() is deprecated since tibble 2.0.0.
results = as_tibble(res.pca$ind$coord) %>%
  mutate(ID_file = as.data.frame(depth_genes_PCA)[complete.cases(depth_genes_PCA),1]) %>%
  left_join(.,readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                                 sheet = 1, n_max = 1000))
# PC1/PC2 coloured by continent; axis labels carry the % variance explained.
p = ggplot(results, aes(x = Dim.1, y = Dim.2, text = ID_file, color = Continent_Region)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 1 (", round(res.pca$eig[1,2]), "%)"),
       y = paste0("PC 2 (", round(res.pca$eig[2,2]), "%)")) +
  Color_Continent +
  theme_bw()
ggplotly(p)
# Same coordinates coloured by collection, to look for batch effects.
p = ggplot(results, aes(x = Dim.1, y = Dim.2, text = ID_file, color = Collection)) +
  geom_point() +
  labs(x = paste0("PC 1 (", round(res.pca$eig[1,2]), "%)"),
       y = paste0("PC 2 (", round(res.pca$eig[2,2]), "%)")) +
  theme_bw()
ggplotly(p)
Different datasets have very different depths of coverage. It is possible that this explains the clustering observed above. What if I simplified the matrix to include only rounded numbers, which would be an approximation of CNV?
library(FactoMineR)
# Same PCA on ROUNDED copy-number values, restricted to samples with a
# reasonable core depth (10-50x) so coverage differences matter less.
# NOTE(review): the 1000 columns are again sampled without set.seed().
depth_genes_PCA = depth_per_gene %>%
  filter(Median_core_depth >= 10 & Median_core_depth <= 50) %>%
  dplyr::select(ID_file, Gene_ID, Rounded_norm) %>%
  pivot_wider(names_from = Gene_ID, values_from = Rounded_norm)
temp = as.data.frame(depth_genes_PCA)[complete.cases(depth_genes_PCA),2:ncol(depth_genes_PCA)]
temp2 = temp[, sample(ncol(temp), size=1000,replace=FALSE)]
temp2 = sapply( temp2, as.numeric )
res.pca = PCA(temp2, scale.unit=TRUE, ncp=10, graph=F)
# FIX: as_tibble() — as.tibble() is deprecated since tibble 2.0.0.
results = as_tibble(res.pca$ind$coord) %>%
  mutate(ID_file = as.data.frame(depth_genes_PCA)[complete.cases(depth_genes_PCA),1]) %>%
  left_join(.,readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                                 sheet = 1, n_max = 1000))
p = ggplot(results, aes(x = Dim.1, y = Dim.2, text = ID_file, color = Continent_Region)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 1 (", round(res.pca$eig[1,2]), "%)"),
       y = paste0("PC 2 (", round(res.pca$eig[2,2]), "%)")) +
  Color_Continent +
  theme_bw()
ggplotly(p)
p = ggplot(results, aes(x = Dim.2, y = Dim.3, text = ID_file, color = Continent_Region)) +
  geom_point(aes(color = Continent_Region)) +
  # FIX: this plot shows PCs 2 and 3 but its axis labels reported the
  # variance of PCs 1 and 2; use eigenvalue rows 2 and 3.
  labs(x = paste0("PC 2 (", round(res.pca$eig[2,2]), "%)"),
       y = paste0("PC 3 (", round(res.pca$eig[3,2]), "%)")) +
  Color_Continent +
  theme_bw()
ggplotly(p)
# Coloured by collection, to look for batch effects.
p = ggplot(results, aes(x = Dim.1, y = Dim.2, text = ID_file, color = Collection)) +
  geom_point() +
  labs(x = paste0("PC 1 (", round(res.pca$eig[1,2]), "%)"),
       y = paste0("PC 2 (", round(res.pca$eig[2,2]), "%)")) +
  theme_bw()
ggplotly(p)
Hum… Something looks really wrong! Not sure why. Let’s look at it again at some point.
# Per-isolate count of genes with an estimated copy number above one,
# shown as violin + boxplot per continent.
depth_per_gene %>%
  filter(between(Median_core_depth, 10, 50)) %>%
  filter(Rounded_norm > 1) %>%
  filter(!is.na(Continent_Region)) %>%
  dplyr::count(ID_file, Continent_Region, name = "Nb_duplicated_genes") %>%
  #filter(Nb_duplicated_genes < 400) %>%
  ggplot(aes(x = Continent_Region,
             y = Nb_duplicated_genes,
             fill = Continent_Region)) +
  geom_violin() +
  geom_boxplot(color = "white", width=0.08, outlier.shape = NA) +
  Fill_Continent +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 10, angle = 20, hjust = 1)) +
  labs( x = "", y = "Number of genes",
        title = str_wrap(paste0("How many genes have a copy number ",
                                "higher than one?"), width = 60),
        subtitle = "The estimation of the copy number is based on depth of coverage.")
# Per-isolate count of genes with an estimated copy number of exactly two
# (1 < Rounded_norm <= 2).
duplicated_genes = depth_per_gene %>%
  filter(between(Median_core_depth, 10, 50)) %>%
  filter(Rounded_norm > 1) %>%
  filter(Rounded_norm <= 2) %>%
  dplyr::filter(!is.na(Continent_Region)) %>%
  dplyr::count(ID_file, Continent_Region, name = "Nb_duplicated_genes")
#filter(Nb_duplicated_genes < 400) %>%
# Violin + boxplot per continent.
ggplot(duplicated_genes, aes(x = Continent_Region,
                             y = Nb_duplicated_genes,
                             fill = Continent_Region)) +
  geom_violin() +
  geom_boxplot(color = "white", width=0.08, outlier.shape = NA) +
  Fill_Continent +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 10, angle = 20, hjust = 1)) +
  labs( x = "", y = "Number of genes",
        title = str_wrap(paste0("How many genes have a copy number ",
                                "of two?"), width = 60),
        subtitle = "The estimation of the copy number is based on depth of coverage.")
# Same summary, printed as a (grouped) table.
depth_per_gene %>%
  filter(Median_core_depth >= 10 & Median_core_depth <= 50) %>%
  filter(Rounded_norm > 1) %>%
  filter(Rounded_norm <= 2) %>%
  filter(!is.na(Continent_Region)) %>%
  group_by(ID_file, Continent_Region) %>%
  summarize(Nb_duplicated_genes = n())
## # A tibble: 300 x 3
## # Groups: ID_file [300]
## ID_file Continent_Region Nb_duplicated_genes
## <chr> <chr> <int>
## 1 ST01AUS_1A4 Oceania 54
## 2 ST01AUS_1A5 Oceania 36
## 3 ST01AUS_1A6 Oceania 71
## 4 ST01AUS_1A9 Oceania 45
## 5 ST01AUS_1B1 Oceania 80
## 6 ST01AUS_1B2 Oceania 69
## 7 ST01AUS_1B7 Oceania 77
## 8 ST01AUS_1B8 Oceania 66
## 9 ST01AUS_1C1 Oceania 59
## 10 ST01AUS_1C2 Oceania 50
## # … with 290 more rows
# Wide table (one row per gene, one column per sample, values = rounded CNV)
# written out for use on the cluster.
rounded_cnv_wide <- depth_per_gene %>%
  filter(between(Median_core_depth, 10, 50)) %>%
  dplyr::select(ID_file, Gene_ID, Rounded_norm) %>%
  pivot_wider(names_from = ID_file, values_from = Rounded_norm)
write_tsv(rounded_cnv_wide, paste0(VAR_dir,"2_Depth_per_gene/Rounded_depth_per_gene_per_sample.tab"))
Just creating a file to use on the cluster
# For every gene, write a tab-separated line: gene name followed by the
# samples whose rounded CNV is exactly "1" (single copy).
# FIX: both file handles are now managed by `with`, so the output file is
# flushed and closed even if an error occurs mid-loop.
with open(r.sel_dir + "Only_unique_copies.tab", "w") as out :
    with open(r.VAR_dir + "2_Depth_per_gene/Rounded_depth_per_gene_per_sample.tab") as depth_input :
        for i, line in enumerate(depth_input) :
            if i == 0 :
                # Header: gene-ID column followed by one column per sample.
                header = line.strip().split("\t")
            else :
                line_sp = line.strip().split("\t")
                gene = line_sp[0]
                sample_list = [gene]
                for sample, CNV in zip(header[1:], line_sp[1:]) :
                    if CNV == "1" :
                        sample_list.append(sample)
                # Assignment suppresses the byte count otherwise echoed by knitr.
                p = out.write("\t".join(sample_list) + "\n")
print("I'm done.")
## I'm done.
# Free the large per-gene depth table before the population-structure section.
rm(depth_per_gene)
Question: Is the world-wide population of Z.tritici structured? If so, is it structured according to geography, host or time (or any other relevant info we hopefully have)?
Previous genomic work has shown very clear structure between populations of Z. tritici. However, the sampling was extremely heterogeneous. With a more geographically even sampling, do we also observe a clear-cut structure?
Methods: Analyses to create would be:
As a first method to investigate the population structure of Z. tritici at the world-wide scale, I chose to do a principal component analysis based on a subset of the SNPs. This PCA confirms previous results of a geographically structured species. Oceania emerges quite distinctly as three separate clusters: one in New Zealand and two in Australia (see below for a more in-depth analysis of this pattern). North America is also quite separate. The distinction between the rest of the geographical locations is not clear in this PCA, although PC3 might show a slight differentiation between Europe and the Middle East.
# Subset the joint VCF to the retained isolates ($ZTLIST.txt) and keep only
# PASS sites that are still variant in this subset (non-ref allele count >= 1).
vcftools --vcf ${VCFDIR}${VCFNAME}.recode.vcf \
--keep $ZTLIST.txt \
--non-ref-ac-any 1 \
--remove-filtered-all --recode --recode-INFO-all \
--out ${VCFDIR}${VCFNAME}.pass
# Convert the filtered VCF to GDS and run a SNP-based PCA (SNPRelate).
snpgdsVCF2GDS(paste0(vcf_dir, vcf_name, ".pass.recode.vcf"),
              paste0(PopStr_dir, vcf_name, ".pass.recode.gds"), method="biallelic.only")
genofile <- snpgdsOpen(paste0(PopStr_dir, vcf_name, ".pass.recode.gds"))
pca <- snpgdsPCA(genofile)
snpgdsClose(genofile)
# FIX: as_tibble() — as.tibble() is deprecated since tibble 2.0.0.
pca2 = as_tibble(pca$eigenvect) %>% dplyr::select(V1:V4)
colnames(pca2) = c("PC1", "PC2", "PC3", "PC4")
# Attach metadata; for_display = "<sample>_<country>" for plotly tooltips.
pca2 = pca2 %>%
  dplyr::mutate(sample_id = pca$sample.id ) %>%
  dplyr::right_join(., Zt_meta, by = c("sample_id" = "ID_file")) %>%
  unite(sample_id, Country, col = "for_display", remove = F)
# Scree plot of the non-NA eigenvalues (seq_along instead of 1:length).
as_tibble(pca$eigenval[!is.na(pca$eigenval)]) %>%
  ggplot(aes(x = seq_along(pca$eigenval[!is.na(pca$eigenval)]),
             y =value)) + geom_point() +
  theme_bw()
# Total variance, used to express each eigenvalue as a percentage.
eigen_sum = sum(pca$eigenval[!is.na(pca$eigenval)])
# PC1/PC2 coloured by continent.
p = ggplot(pca2, aes(x = PC1, y= PC2, text = for_display)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 1 (", round(pca$eigenval[1]*100/eigen_sum, 2), "%)"),
       y = paste0("PC 2 (", round(pca$eigenval[2]*100/eigen_sum, 2), "%)")) +
  Color_Continent +
  theme_bw()
#p
ggplotly(p)
# PC3/PC4 coloured by continent.
q = ggplot(pca2, aes(x = PC3, y= PC4, text = for_display)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 3 (", round(pca$eigenval[3]*100/eigen_sum, 2), "%)"),
       y = paste0("PC 4 (", round(pca$eigenval[4]*100/eigen_sum, 2), "%)")) +
  Color_Continent +
  theme_bw()
#q
ggplotly(q)
# Pairs plot of the first four PCs with the fixed continent palette.
p = ggpairs(pca2, columns = c(1:4),
            ggplot2::aes(col=Continent_Region, fill = Continent_Region, alpha = 0.6),
            title = "PCA based thinned SNPs",
            upper = list(continuous = "points", combo = "box_no_facet"))
# FIX: seq_len() instead of 1:n (safe when n is 0); apply theme + palette
# to every panel of the ggpairs matrix.
for(i in seq_len(p$nrow)) {
  for(j in seq_len(p$ncol)){
    p[i,j] <- p[i,j] + theme_bw() + Color_Continent + Fill_Continent
  }
}
p
In order to investigate more precisely the population structure in Europe, the Middle East and America, I removed the Oceanian isolates and recreated the PCA. This highlights the difference between the North American samples and the rest of the samples, in Europe, South America and the Middle East. These last locations do not separate as distinct clusters but as a gradient, visible on PC2 (from Europe to the Middle East) and PC4.
# Isolate list excluding Oceania, used to re-run the PCA without the
# strongly differentiated Oceanian clusters.
Zt_meta %>%
filter(is.na(Duplicate_Clone_LowQual_Flags) | Duplicate_Clone_LowQual_Flags != "DUPLICATE_remove") %>%
#filter(is.na(Duplicate_Clone_LowQual_Flags) | Duplicate_Clone_LowQual_Flags != "Publication_embargo") %>%
# NOTE(review): `depth` holds one row per sample x chromosome at this point,
# so the join duplicates each ID in the output list; vcftools --keep
# tolerates repeated IDs, but distinct() would be cleaner — confirm.
inner_join(., depth %>% filter(MEAN_DEPTH >= 5)) %>%
filter(Continent_Region != "Oceania") %>%
dplyr::select(ID_file) %>% write_delim(paste0(PopStr_dir, "Zt_list_wo_oceania.txt"))
# Same PASS/variant filtering, restricted to the non-Oceanian isolates.
vcftools --vcf ${VCFDIR}${VCFNAME}.recode.vcf \
--keep ${POPSTR}Zt_list_wo_oceania.txt \
--remove-filtered-all --recode --recode-INFO-all \
--non-ref-ac-any 1 \
--out ${POPSTR}${VCFNAME}.pass.wo_oceania
# GDS conversion and PCA for the dataset without the Oceanian isolates.
snpgdsVCF2GDS(paste0(PopStr_dir, vcf_name, ".pass.wo_oceania.recode.vcf"),
              paste0(PopStr_dir, vcf_name, ".pass.wo_oceania.recode.gds"),
              method="biallelic.only")
genofile <- snpgdsOpen(paste0(PopStr_dir, vcf_name, ".pass.wo_oceania.recode.gds"))
pca <- snpgdsPCA(genofile)
snpgdsClose(genofile)
# FIX: as_tibble() — as.tibble() is deprecated since tibble 2.0.0.
# (right_join keeps all metadata rows, so excluded samples carry NA PCs.)
pca2 = as_tibble(pca$eigenvect) %>% dplyr::select(V1:V4) %>%
  mutate(sample_id = pca$sample.id ) %>%
  right_join(., Zt_meta, by = c("sample_id" = "ID_file")) %>%
  unite(sample_id, Country, col = "for_display", remove = F)
eigen_sum = sum(pca$eigenval[!is.na(pca$eigenval)])
# PC1/PC2 coloured by continent.
p = ggplot(pca2, aes(x = V1, y= V2, text = for_display)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 1 (", round(pca$eigenval[1]*100/eigen_sum, 2), "%)"),
       y = paste0("PC 2 (", round(pca$eigenval[2]*100/eigen_sum, 2), "%)")) +
  Color_Continent +
  theme_bw()
#p
ggplotly(p)
# PC3/PC4 coloured by continent.
q = ggplot(pca2, aes(x = V3, y= V4, text = for_display)) +
  geom_point(aes(color = Continent_Region)) +
  labs(x = paste0("PC 3 (", round(pca$eigenval[3]*100/eigen_sum, 2), "%)"),
       y = paste0("PC 4 (", round(pca$eigenval[4]*100/eigen_sum, 2), "%)")) +
  Color_Continent +
  theme_bw()
#q
ggplotly(q)
This subsection was created to satisfy my own curiosity, but I guess this is what Megan McDonald is working on at the moment? I might be misremembering her presentation, but there was something about resistance to a new fungicide differing between the two Australian collections.
# Sample list restricted to Oceania: exclude duplicates, embargoed isolates,
# and anything below 5x mean depth; write one file ID per line.
Zt_meta %>%
  filter(is.na(Duplicate_Clone_LowQual_Flags) |
           Duplicate_Clone_LowQual_Flags != "DUPLICATE_remove") %>%
  filter(is.na(Duplicate_Clone_LowQual_Flags) |
           Duplicate_Clone_LowQual_Flags != "Publication_embargo") %>%
  inner_join(., depth %>% filter(MEAN_DEPTH >= 5)) %>%
  filter(Continent_Region == "Oceania") %>%
  dplyr::select(ID_file) %>%
  write_delim(paste0(PopStr_dir, "Zt_list_only_oceania.txt"))
# Subset the joint-call VCF to the Oceanian samples only: PASS sites,
# at least one non-reference allele, output <prefix>.pass.only_oceania
vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
--keep ${POPSTR}Zt_list_only_oceania.txt \
--remove-filtered-all --recode --recode-INFO-all \
--non-ref-ac-any 1 \
--out ${POPSTR}$VCFNAME.pass.only_oceania
# GDS conversion (biallelic SNPs only) and PCA for the Oceania-only subset.
vcf_only_oceania <- paste0(PopStr_dir, vcf_name, ".pass.only_oceania.recode.vcf")
gds_only_oceania <- paste0(PopStr_dir, vcf_name, ".pass.only_oceania.recode.gds")
snpgdsVCF2GDS(vcf_only_oceania, gds_only_oceania, method = "biallelic.only")
genofile <- snpgdsOpen(gds_only_oceania)
pca <- snpgdsPCA(genofile)
snpgdsClose(genofile)
# PCA table for the Oceanian isolates only; inner_join keeps just the
# samples present in the PCA. as.tibble() is deprecated; the
# as.data.frame() step guarantees V1..Vn names for the unnamed matrix.
pca2 = as_tibble(as.data.frame(pca$eigenvect)) %>% dplyr::select(V1:V4) %>%
  mutate(sample_id = pca$sample.id) %>%
  inner_join(., Zt_meta, by = c("sample_id" = "ID_file")) %>%
  unite(sample_id, Country, col = "for_display", remove = F)
eigen_sum = sum(pca$eigenval[!is.na(pca$eigenval)])
# PC1 vs PC2, colored by sampling year (manual blues palette), shaped by country
p = ggplot(pca2, aes(x = V1, y = V2, text = for_display)) +
  geom_point(aes(color = as.character(Date_Year), shape = Country)) +
  labs(x = paste0("PC 1 (", round(pca$eigenval[1]*100/eigen_sum, 2), "%)"),
       y = paste0("PC 2 (", round(pca$eigenval[2]*100/eigen_sum, 2), "%)")) +
  scale_color_manual(values = blues) +
  theme_bw()
#p
ggplotly(p)
# PC3 vs PC4 of the Oceania-only PCA, same year/country aesthetics.
q = pca2 %>%
  ggplot(aes(x = V3, y = V4, text = for_display)) +
  geom_point(aes(color = as.character(Date_Year), shape = Country)) +
  labs(
    x = paste0("PC 3 (", round(pca$eigenval[3] * 100 / eigen_sum, 2), "%)"),
    y = paste0("PC 4 (", round(pca$eigenval[4] * 100 / eigen_sum, 2), "%)")
  ) +
  scale_color_manual(values = blues) +
  theme_bw()
#q
ggplotly(q)
The clustering here is done by using the snmf method from the LEA R package (http://membres-timc.imag.fr/Olivier.Francois/LEA/) on the same subset of SNPs as the PCA, but without any missing data. I ran the analysis for a K (number of clusters inferred) ranging from 1 to 15 and with 10 repeats for each K.
# Extract GT calls for the kept isolates: PASS sites only, zero missing
# data, biallelic, MAF >= 0.05 (same SNP set as the PCA but without NAs).
vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
--keep $ZTLIST.txt \
--remove-filtered-all --extract-FORMAT-info GT \
--max-missing 1.0 --min-alleles 2 --max-alleles 2 \
--maf 0.05 \
--out ${POPSTR}$VCFNAME.pass_noNA
# Drop the CHROM/POS columns -> genotype matrix only
cat ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT | cut -f 3- \
> ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT2
# Keep CHROM/POS separately to map SNP columns back to positions later
cat ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT | cut -f 1,2 \
> ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT.pos
# The header row holds the sample names; make it one ID per line
head -n1 ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT2 | gsed "s/\t/\n/g" \
> ${POPSTR}$VCFNAME.pass_noNA.ind
# Strip tabs and the header row to produce the LEA .geno input (GNU sed)
gsed "s/\t//g" ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT2 | tail -n +2 \
> ${POPSTR}$VCFNAME.pass_noNA.geno
# Run sNMF ancestry estimation for K = 1..15 with 10 repetitions each on
# haploid genotypes; entropy = TRUE enables the cross-entropy criterion
# used below to compare K values.
project = snmf(paste0(PopStr_dir, vcf_name, ".pass_noNA.geno"), K=1:15, entropy = TRUE,
repetitions = 10, project = "new", ploidy = 1)
First, let’s look at the cross-validation results. sNMF estimates an entropy criterion which evaluates the quality of fit of the model to the data, potentially helping to find the best number of ancestral populations.
# Reload the saved sNMF project and plot the cross-entropy criterion per K
# (lower = better fit; used to choose the number of clusters).
project = load.snmfProject(paste0(PopStr_dir, vcf_name, ".pass_noNA.snmfProject"))
plot(project, col = "goldenrod", pch = 19, cex = 1.2)
In the case of our analysis, we do not have a very clear-cut minimum value for the cross-entropy criterion. There is, however, a plateau starting from K=6. I chose to represent as barplots the best run for each value of K (defined as the one with the lowest cross-entropy value). This seems to confirm that K=6 is a reasonable choice, since no new major cluster seems to appear at higher values. It is possible, again, that the Oceanian samples would separate further, as in the PCA, at higher values of K.
The results from the PCA and from the clustering analysis are coherent: Oceania separates into 3 clusters (one in New Zealand, and two in Australia) and the North American isolates form a separate cluster. Higher K values also distinguish a Middle-Eastern/African cluster from the European cluster, representing the two extreme points of the gradient found between these populations in the PCA.
# Collect, for K = 2..8, the admixture coefficients of the best sNMF run
# (lowest cross-entropy) into a list of long tibbles:
# Sample / Cluster / Admix_coef / K.
indv_snmf = read_tsv(paste0(PopStr_dir, vcf_name, ".pass_noNA.ind"), col_names = F)
names(indv_snmf) = "Sample"
datalist = list()
for (i in c(2, 3, 4, 5, 6, 7, 8)){
  # best run = the repetition with the lowest cross-entropy at this K
  best = which.min(cross.entropy(project, K = i))
  temp = as.data.frame(Q(project, i, best))
  temp = cbind(indv_snmf, temp)
  temp = temp %>%
    gather("Cluster", "Admix_coef", -"Sample") %>%
    mutate(K = i)
  # as.tibble() is deprecated; as_tibble() is the supported spelling
  datalist[[i]] = as_tibble(temp)
}
# Join admixture results to metadata and order the Sample factor for the
# barplots (innermost to outermost sort key: year, then country, then
# continent — successive fct_reorder calls keep earlier order as tie-breaker).
snmf_results_per_K = bind_rows(datalist) %>%
  inner_join(., Zt_meta, by = c("Sample" = "ID_file")) %>%
  unite(Continent_Region, Country, col = "for_display", remove = F) %>%
  mutate(
    Sample = fct_reorder(Sample, Date_Year),
    Sample = fct_reorder(Sample, Country),
    Sample = fct_reorder(Sample, Continent_Region)
  )
# Stacked-bar admixture plot, one facet row per K, interactive via plotly.
p = snmf_results_per_K %>%
  ggplot(aes(x = Sample, y = Admix_coef, fill = Cluster, text = for_display)) +
  geom_bar(position = "stack", stat = "identity") +
  facet_grid(K ~ .) +
  theme_bw() +
  theme(axis.title = element_blank(),
        axis.text.x = element_blank(),
        legend.title = element_blank()) #element_text(angle = 60, hjust = 1))
ggplotly(p)
# For every K, locate the Q-matrix file of the best run
# (lowest cross-entropy) inside the sNMF project directory.
K_list = c(1:15)
afiles = character(length(K_list))
for (k in K_list){
  best = which.min(cross.entropy(project, K = k))
  afiles[k] = Sys.glob(
    paste0(PopStr_dir, vcf_name, ".pass_noNA.snmf/K", k, "/run", best, "/*Q")
  )
}
# create a qlist
qlist <- readQBasic(afiles)
# Align cluster labels across K values so colors stay comparable between runs.
# NOTE(review): al_qlist is not reused below (plotQ re-aligns its own subset);
# kept unchanged in case a later chunk depends on it.
al_qlist = alignK(qlist)
# Group labels (continent, country) per isolate, same row order as the Q matrices
lab_set = inner_join(indv_snmf, Zt_meta, by = c("Sample" = "ID_file")) %>%
dplyr::select(Continent_Region, Country)
# Range of K values to display side by side
from = 4
up_to = 7
# Joined barplots for K = from..up_to, grouped and ordered by metadata labels
p1 <- plotQ(alignK(qlist[from:up_to], type = "across"),
imgoutput="join",
returnplot=T,exportplot=F,
quiet=T,basesize=11,
splab= paste0("K=", K_list[from:up_to]),
grplab=lab_set, ordergrp=T, grplabangle = 40, grplabheight = 2, grplabsize = 2)
#TODO: add own colors clustercol=c("#A6CEE3", "#3F8EAA", "#79C360", "#E52829", "#FDB762",)
grid.arrange(p1$plot[[1]])
chosen_K = 6
chosen_threshold = 0.8
# Bubble plot of the mean admixture coefficient per country and cluster at
# the chosen K; tiny coefficients (< 0.02) are hidden for readability.
snmf_results_per_K %>%
  filter(K == chosen_K) %>%
  dplyr::group_by(Continent_Region, for_display, Cluster) %>%
  dplyr::summarize(Ave_admix_coef = round(mean(Admix_coef), 2)) %>%
  dplyr::filter(Ave_admix_coef >= 0.02) %>%
  ggplot(aes(x = for_display, y = Cluster,
             size = Ave_admix_coef, color = Continent_Region)) +
  geom_point(alpha = 0.5) +
  Color_Continent +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(x = "", title = "Average admixture coefficient per country")
#Looking at individuals with admixture coef higher than the threshold defined above.
high_anc_coef_snmf = snmf_results_per_K %>%
  filter(K == chosen_K, Admix_coef > chosen_threshold)
##Table
# Count, per country (for_display) and cluster, the isolates assigned above
# the threshold; spread clusters into columns (missing combinations -> 0).
high_anc_coef_snmf %>%
  dplyr::group_by(for_display, Cluster) %>%
  dplyr::count() %>%
  pivot_wider(names_from = Cluster, values_from = n, values_fill = list(n = 0)) %>%
  kable()
| for_display | V1 | V6 | V2 | V3 | V5 | V4 |
|---|---|---|---|---|---|---|
| Africa_Algeria | 1 | 0 | 0 | 0 | 0 | 0 |
| Africa_Ethiopia | 1 | 0 | 0 | 0 | 0 | 0 |
| Africa_Kenya | 0 | 1 | 0 | 0 | 0 | 0 |
| Africa_Morocco | 1 | 0 | 0 | 0 | 0 | 0 |
| Africa_Tunisia | 3 | 2 | 0 | 0 | 0 | 0 |
| Europe_Czech Republic | 0 | 2 | 0 | 0 | 0 | 0 |
| Europe_Denmark | 0 | 9 | 0 | 0 | 0 | 0 |
| Europe_France | 1 | 123 | 0 | 0 | 0 | 0 |
| Europe_Germany | 0 | 5 | 0 | 0 | 0 | 0 |
| Europe_Ireland | 0 | 3 | 0 | 0 | 0 | 0 |
| Europe_Italy | 1 | 0 | 0 | 0 | 0 | 0 |
| Europe_Latvia | 0 | 2 | 0 | 0 | 0 | 0 |
| Europe_Netherlands | 0 | 5 | 0 | 0 | 0 | 0 |
| Europe_Sweden | 0 | 6 | 0 | 0 | 0 | 0 |
| Europe_Switzerland | 0 | 183 | 0 | 0 | 0 | 0 |
| Europe_UK | 0 | 8 | 0 | 0 | 0 | 0 |
| Middle East_Iran | 11 | 0 | 0 | 0 | 0 | 0 |
| Middle East_Israel | 32 | 0 | 0 | 0 | 0 | 0 |
| Middle East_Syria | 4 | 0 | 0 | 0 | 0 | 0 |
| Middle East_Turkey | 5 | 0 | 0 | 0 | 0 | 0 |
| Middle East_Yemen | 1 | 0 | 0 | 0 | 0 | 0 |
| North America_Canada | 0 | 0 | 2 | 0 | 0 | 0 |
| North America_United States | 0 | 0 | 163 | 0 | 0 | 0 |
| Oceania_Australia | 0 | 0 | 0 | 56 | 36 | 0 |
| Oceania_New Zealand | 0 | 0 | 0 | 0 | 0 | 29 |
##Pretty plot
# Number of strongly-assigned genotypes (admix coef > threshold) per
# country and cluster, shown as point sizes.
p_cluster = high_anc_coef_snmf %>%
  dplyr::group_by(Continent_Region, for_display, Cluster) %>%
  dplyr::count() %>%
  ggplot(aes(x = for_display, y = Cluster, size = n, color = Continent_Region)) +
  geom_point(alpha = 0.5) +
  Color_Continent +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(x = "", title = "Number of genotypes with admix coef > 0.8")
p_cluster
##Writing out tables for later
# `path =` was deprecated in readr 2.0 in favour of `file =`; passing the
# output path positionally works on both old and new readr versions.
# .ind: one sample ID per line (for vcftools --keep); .tsv: full table.
high_anc_coef_snmf %>% dplyr::select(Sample) %>%
  write_tsv(paste0(PopStr_dir, vcf_name, ".pass_noNA.high_anc_coef_snmf.ind"),
            col_names = F)
high_anc_coef_snmf %>%
  write_tsv(paste0(PopStr_dir, vcf_name, ".pass_noNA.high_anc_coef_snmf.tsv"),
            col_names = T)
Note: I am not entirely sure about whether the high density of isolates from specific fields could bias the results. I remember it was the case with STRUCTURE, not sure if it is also the case of sNMF. To be checked.
In the steps before, I have learned about population history indirectly by inferring genetic populations from the genomic data. The relationship between the populations and the underlying demography is not explicit in these analyses, however. It is possible, though, to infer splits between populations and create a population tree. Here, I use treemix, which takes into account the possibility of gene flow between populations and indeed tests for it in the process of creating a population tree.
Because the populations in the clustering were not perfectly distinct from one another, I start with “discretized” populations by choosing only the isolates with high ancestry in one of the sNMF clusters.
# Subset the VCF to the strongly-assigned isolates (admix coef > 0.8) and
# extract GT calls: PASS, no missing data, biallelic, MAF >= 0.05.
vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
--keep ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.ind \
--remove-filtered-all --extract-FORMAT-info GT \
--max-missing 1.0 --min-alleles 2 --max-alleles 2 \
--maf 0.05 \
--out ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf
# Drop CHROM/POS -> genotype matrix used to build the treemix input below
cat ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.GT.FORMAT | cut -f 3- \
> ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.GT.FORMAT2
from collections import defaultdict
# Convert the haploid GT matrix into treemix input: one header line with the
# population (cluster) names, then one line per SNP with "ref_count,alt_count"
# per population. Files are now opened with context managers so they are
# closed even if the conversion fails part-way.
#For each isolate, store its sNMF cluster assignment in a dictionary
dict_pop = dict(zip(r.high_anc_coef_snmf["Sample"],
                    r.high_anc_coef_snmf["Cluster"]))
#Keep a sorted list of the cluster names to write the columns in a fixed order
all_pops = sorted(set(r.high_anc_coef_snmf["Cluster"]))
out_name = r.PopStr_dir + r.vcf_name + ".pass_noNA.high_anc_coef_snmf.treemix"
with open(out_name, "w") as out, \
     open(r.PopStr_dir + r.vcf_name + ".pass_noNA.high_anc_coef_snmf.GT.FORMAT2", "r") as input_snps:
    out.write(" ".join(all_pops) + "\n")
    indv = []
    for i, snp in enumerate(input_snps):
        # Per-population counts of the 0 and 1 alleles at this SNP
        dict_snp0 = defaultdict(int)
        dict_snp1 = defaultdict(int)
        lets_write = True
        if i == 0:
            # The first line holds the isolate names
            indv = snp.strip().split("\t")
            lets_write = False
        else:
            # Pair each isolate with its allele call and count per population
            for ind, allele in zip(indv, snp.strip().split("\t")):
                if allele == "0":
                    dict_snp0[dict_pop[ind]] += 1
                elif allele == "1":
                    dict_snp1[dict_pop[ind]] += 1
                else:
                    # Non-biallelic / unexpected call: warn and skip this SNP
                    print("Only biallelic please!!!!")
                    lets_write = False
        # Write the per-population "ref,alt" counts when the SNP was clean
        if lets_write:
            out.write(" ".join(
                ",".join([str(dict_snp0[pop]), str(dict_snp1[pop])])
                for pop in all_pops) + "\n")
# treemix expects a gzipped allele-count file
gzip ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.treemix
# Infer the population tree with up to 10 migration edges, rooted on
# cluster V1 (the Middle-Eastern/African cluster)
treemix \
-i ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.treemix.gz \
-o ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.treemix.out \
-m 10 -root V1
# Plotting helpers shipped with treemix (plot_tree, plot_resid, ...)
source("/Users/feurtey/Documents/Software/treemix-1.13/src/plotting_funcs.R")
# Draw the inferred population tree with its migration edges.
# NOTE(review): `t` shadows base::t() from here on — rename if t() is needed later.
t = plot_tree(paste0(PopStr_dir, vcf_name, ".pass_noNA.high_anc_coef_snmf.treemix.out"))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 0 <NA> NOT_ROOT NOT_MIG NOT_TIP 32 1 1 51 1
## 2 1 V5 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 3 2 <NA> NOT_ROOT MIG NOT_TIP 0 1 NA NA NA
## 4 3 V3 NOT_ROOT NOT_MIG TIP 58 NA NA NA NA
## 5 4 V2 NOT_ROOT NOT_MIG TIP 32 NA NA NA NA
## 6 15 V1 NOT_ROOT NOT_MIG TIP 53 NA NA NA NA
## 7 16 <NA> NOT_ROOT NOT_MIG NOT_TIP 53 58 2 32 3
## 8 31 V6 NOT_ROOT NOT_MIG TIP 58 NA NA NA NA
## 9 32 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 0 2 4 1
## 10 51 V4 NOT_ROOT NOT_MIG TIP 0 NA NA NA NA
## 11 53 <NA> ROOT NOT_MIG NOT_TIP 53 15 1 16 5
## 12 58 <NA> NOT_ROOT NOT_MIG NOT_TIP 16 31 1 3 1
## 13 77 <NA> NOT_ROOT MIG NOT_TIP 0 51 NA NA NA
## 14 147 <NA> NOT_ROOT MIG NOT_TIP 53 15 NA NA NA
## 15 213 <NA> NOT_ROOT MIG NOT_TIP 0 1 NA NA NA
## 16 128 <NA> NOT_ROOT MIG NOT_TIP 32 4 NA NA NA
## 17 272 <NA> NOT_ROOT MIG NOT_TIP 58 3 NA NA NA
## V11
## 1 (V5:0.107996,V4:0.0728333):0.0169245
## 2 V5:0.107996
## 3 <NA>
## 4 V3:0.228824
## 5 V2:0.0388698
## 6 V1:0.0223374
## 7 ((V6:0.000589609,V3:0.228824):0.00303852,((V5:0.107996,V4:0.0728333):0.0169245,V2:0.0388698):0.00253829):0.0217937
## 8 V6:0.000589609
## 9 ((V5:0.107996,V4:0.0728333):0.0169245,V2:0.0388698):0.00253829
## 10 V4:0.0728333
## 11 (V1:0.0223374,((V6:0.000589609,V3:0.228824):0.00303852,((V5:0.107996,V4:0.0728333):0.0169245,V2:0.0388698):0.00253829):0.0217937);
## 12 (V6:0.000589609,V3:0.228824):0.00303852
## 13 <NA>
## 14 <NA>
## 15 <NA>
## 16 <NA>
## 17 <NA>
## x y ymin ymax
## 1 0.04125649 0.33333333 0.1666667 0.5000000
## 2 0.11408983 0.41666667 0.3333333 0.5000000
## 3 0.10080531 NA 0.1666667 0.3333333
## 4 0.11594056 0.58333333 0.5000000 0.6666667
## 5 0.06313656 0.08333333 0.0000000 0.1666667
## 6 0.02233551 0.91666667 0.8333333 1.0000000
## 7 0.02179370 0.50000000 0.0000000 0.8333333
## 8 0.02485376 0.75000000 0.6666667 0.8333333
## 9 0.02433199 0.16666667 0.0000000 0.5000000
## 10 0.11408983 0.25000000 0.1666667 0.3333333
## 11 0.00000000 0.83333333 0.0000000 1.0000000
## 12 0.02457554 0.66666667 0.5000000 0.8333333
## 13 0.11374769 NA 0.1666667 0.3333333
## 14 0.02179191 NA 0.0000000 0.8333333
## 15 0.11408983 NA 0.1666667 0.3333333
## 16 0.02454141 NA 0.0000000 0.1666667
## 17 0.11412284 NA 0.5000000 0.6666667
## [1] "0.595458 0.04125649 0.114089833"
## [1] "0.995302 0.04125649 0.114089833"
## [1] "0.975662 0 0.0223355133752591"
## [1] "0.845105 0.04125649 0.114089833"
## [1] "0.00539676 0.02433199 0.0631365648855101"
## [1] "0.980105 0.0245755369176556 0.115940560291508"
## [1] 0.11408983 0.11594056 0.06313656 0.02233551 0.02485376 0.11408983
## [1] 0.003
## [1] "mse 0.000523137138888889"
# Re-display the cluster-membership bubble plot defined earlier, as a
# reminder of which sNMF cluster corresponds to which geography.
p_cluster
Along with the treemix software are distributed the programs threepop and fourpop. These compute f3 and f4 statistics, which test for treeness in population trees.
The three-population test is of the form f3(A;B;C), where a significantly negative value of the f3 statistic implies that population A is admixed. The output is four columns: populations | f3 statistic | standard error | Z-score
# f3 tests over all population triplets, jackknife blocks of 500 SNPs
threepop -i ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.treemix.gz -k 500
## npop:6 nsnp:11853
## Estimating covariance matrix in 23 blocks of size 500
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V2,V3 0.0451248 0.00204994 22.0127
## V2;V1,V3 0.0403089 0.00204847 19.6776
## V3;V1,V2 0.103211 0.00502406 20.5434
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V2,V4 0.0466498 0.00228677 20.3999
## V2;V1,V4 0.0387839 0.00193089 20.086
## V4;V1,V2 0.0897219 0.00566013 15.8515
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V2,V5 0.0449815 0.00207906 21.6355
## V2;V1,V5 0.0404522 0.00192851 20.9759
## V5;V1,V2 0.11394 0.0034061 33.4519
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V2,V6 0.045036 0.00181266 24.8452
## V2;V1,V6 0.0403977 0.00165836 24.3601
## V6;V1,V2 0.00105419 0.00086768 1.21495
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V3,V4 0.0537752 0.00300243 17.9106
## V3;V1,V4 0.0945609 0.00492194 19.2121
## V4;V1,V3 0.0825965 0.00618335 13.3579
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V3,V5 0.0726603 0.00290698 24.9951
## V3;V1,V5 0.0756758 0.00739709 10.2305
## V5;V1,V3 0.0862617 0.00232528 37.0973
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V3,V6 0.045901 0.0021903 20.9565
## V3;V1,V6 0.102435 0.00498495 20.5489
## V6;V1,V3 0.000189219 0.000821908 0.230219
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V4,V5 0.0611872 0.00316706 19.3199
## V4;V1,V5 0.0751845 0.00758605 9.91088
## V5;V1,V4 0.0977347 0.00375752 26.0104
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V4,V6 0.0476207 0.00203434 23.4085
## V4;V1,V6 0.0887511 0.00532023 16.6818
## V6;V1,V4 -0.00153048 0.000778229 -1.96662
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1;V5,V6 0.0437645 0.00196063 22.3216
## V5;V1,V6 0.115157 0.00327818 35.1285
## V6;V1,V5 0.00232569 0.00106883 2.17591
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V3,V4 0.0474343 0.00223 21.271
## V3;V2,V4 0.096086 0.00552364 17.3954
## V4;V2,V3 0.0810715 0.00612219 13.2422
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V3,V5 0.0679877 0.00379085 17.9347
## V3;V2,V5 0.0755325 0.00733645 10.2955
## V5;V2,V3 0.086405 0.00227068 38.0525
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V3,V6 0.0411739 0.0018313 22.4834
## V3;V2,V6 0.102346 0.0049528 20.6644
## V6;V2,V3 0.000278042 0.000613192 0.453434
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V4,V5 0.0549896 0.00290761 18.9123
## V4;V2,V5 0.0735161 0.0077444 9.49281
## V5;V2,V4 0.0994031 0.00362274 27.4386
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V4,V6 0.0413686 0.00177868 23.258
## V4;V2,V6 0.0871372 0.00546256 15.9517
## V6;V2,V4 8.33753e-05 0.000764 0.10913
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2;V5,V6 0.0391807 0.00173726 22.5531
## V5;V2,V6 0.115212 0.00346718 33.2293
## V6;V2,V5 0.00227121 0.000924398 2.45696
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V3;V4,V5 0.0830879 0.00722354 11.5024
## V4;V3,V5 0.0940696 0.00791809 11.8803
## V5;V3,V4 0.0788496 0.00304438 25.9001
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V3;V4,V6 0.0962806 0.00493025 19.5286
## V4;V3,V6 0.0808768 0.00599492 13.4909
## V6;V3,V4 0.00634375 0.00132474 4.78866
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V3;V5,V6 0.0735394 0.0071297 10.3145
## V5;V3,V6 0.0883982 0.00224008 39.4621
## V6;V3,V5 0.029085 0.00281421 10.335
## Estimating f_3 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V4;V5,V6 0.0713283 0.00717146 9.94614
## V5;V4,V6 0.101591 0.00373716 27.184
## V6;V4,V5 0.0158923 0.00226846 7.00577
The four-population test is of the form f4(A;B;C;D), where a significantly negative value of the f4 statistic implies gene flow in the tree. The output is four columns: populations | f4 statistic | standard error | Z-score
# f4 tests over all population quadruples, jackknife blocks of 500 SNPs
fourpop -i ${POPSTR}$VCFNAME.pass_noNA.high_anc_coef_snmf.treemix.gz -k 500
## npop:6 nsnp:11853
## Estimating covariance matrix in 23 blocks of size 500
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V3,V4 0.00152503 0.0012688 1.20195
## V1,V3;V2,V4 0.00865038 0.00161996 5.33986
## V1,V4;V2,V3 0.00712535 0.00201599 3.53442
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V3,V5 -0.000143303 0.0011177 -0.128212
## V1,V3;V2,V5 0.0275355 0.00285563 9.64252
## V1,V5;V2,V3 0.0276788 0.00270388 10.2367
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V3,V6 -8.88239e-05 0.000884867 -0.100381
## V1,V3;V2,V6 0.000776149 0.000851147 0.911885
## V1,V6;V2,V3 0.000864973 0.00107002 0.808374
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V4,V5 -0.00166833 0.00133229 -1.25223
## V1,V4;V2,V5 0.0145374 0.0028608 5.08158
## V1,V5;V2,V4 0.0162057 0.00262948 6.1631
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V4,V6 -0.00161385 0.00119698 -1.34827
## V1,V4;V2,V6 0.000970816 0.00128514 0.755418
## V1,V6;V2,V4 0.00258467 0.000893807 2.89176
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V2;V5,V6 5.44793e-05 0.00112154 0.0485756
## V1,V5;V2,V6 -0.00121702 0.00080188 -1.51771
## V1,V6;V2,V5 -0.0012715 0.000993483 -1.27984
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V3;V4,V5 0.0188851 0.0029896 6.31693
## V1,V4;V3,V5 0.00741206 0.00245444 3.01985
## V1,V5;V3,V4 -0.011473 0.0030933 -3.709
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V3;V4,V6 -0.00787423 0.00160172 -4.91611
## V1,V4;V3,V6 -0.00615453 0.0018872 -3.2612
## V1,V6;V3,V4 0.0017197 0.000806509 2.13227
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V3;V5,V6 -0.0267593 0.00299377 -8.93834
## V1,V5;V3,V6 -0.0288958 0.00255045 -11.3297
## V1,V6;V3,V5 -0.00213647 0.000902038 -2.36849
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V1,V4;V5,V6 -0.0135666 0.00285146 -4.75776
## V1,V5;V4,V6 -0.0174228 0.00239605 -7.27146
## V1,V6;V4,V5 -0.00385617 0.000980944 -3.93108
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2,V3;V4,V5 0.0205534 0.00285412 7.20133
## V2,V4;V3,V5 0.00755536 0.00254648 2.96698
## V2,V5;V3,V4 -0.0129981 0.00316866 -4.10207
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2,V3;V4,V6 -0.00626037 0.00142928 -4.3801
## V2,V4;V3,V6 -0.00606571 0.00143319 -4.2323
## V2,V6;V3,V4 0.000194667 0.000975707 0.199514
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2,V3;V5,V6 -0.0268138 0.00306488 -8.74872
## V2,V5;V3,V6 -0.028807 0.00275725 -10.4477
## V2,V6;V3,V5 -0.00199317 0.000858599 -2.32142
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V2,V4;V5,V6 -0.0136211 0.00270404 -5.03731
## V2,V5;V4,V6 -0.0158089 0.00236794 -6.67621
## V2,V6;V4,V5 -0.00218783 0.00112569 -1.94354
## Estimating f_4 in 23 blocks of size 500
## total_nsnp 11853 nsnp 11853
## V3,V4;V5,V6 0.0131927 0.00326116 4.04541
## V3,V5;V4,V6 -0.00954853 0.00206454 -4.62502
## V3,V6;V4,V5 -0.0227413 0.00291064 -7.81316
As of July 2020, this is done on the thinned vcf file. The part for the SFS is fine in this way. But the statistics should be run on the whole filtered vcf file instead. At least the code is ready.
# Working directories for the per-chromosome PopGenome inputs
mkdir ${SUMST}PopGenome_splitchr
mkdir ${SUMST}PopGenome_splitchr/vcf
mkdir ${SUMST}PopGenome_splitchr/gff
mkdir ${SUMST}PopGenome_splitchr/fasta
cd ${SUMST}PopGenome_splitchr
# split into individual chromosome files starting from a complete vcf that contains exactly the loci you want
java -jar /Users/feurtey/Documents/Software/snpEff/SnpSift.jar \
split ${VCFDIR}${VCFNAME_NOMAF}.recode.vcf
# create directories named after each vcf file, moves each file into its own folder
for x in {1..21} ;
do
mkdir vcf/$x
mv ${VCFDIR}${VCFNAME_NOMAF}.recode.${x}.vcf vcf/${x}/${x}.vcf
mkdir gff/$x
# Per-chromosome gff: lines whose first field is the chromosome name
grep "^${x}\t" ${GFFFILE} > ${SUMST}PopGenome_splitchr/gff/${x}/${x}.gff
# Per-chromosome reference fasta extracted from the full reference
~/Documents/Software/samtools-1.10/samtools faidx \
${REFFILE} ${x} > fasta/${x}.fa
done
# collect the set of chromosomes to process by scanning the vcf folders
Pop_vcf_dir = paste0(Sumstats_dir, "PopGenome_splitchr/vcf/")
Pop_gff_dir = paste0(Sumstats_dir, "PopGenome_splitchr/gff/")
Pop_ref_dir = paste0(Sumstats_dir, "PopGenome_splitchr/fasta/")
### Define population structure (can even be overlapping among groups if this is necessary)
# Sample IDs belonging to one sNMF cluster, as a character vector.
# FIX(consistency): the original only wrapped V1 in as.character(), leaving
# V2..V6 as factors (Sample is a factor after fct_reorder()); all six
# populations now get the same character type.
cluster_samples <- function(cluster_id) {
  high_anc_coef_snmf %>%
    filter(Cluster == cluster_id) %>%
    pull(Sample) %>%
    as.character()
}
V1 <- cluster_samples("V1")
V2 <- cluster_samples("V2")
V3 <- cluster_samples("V3")
V4 <- cluster_samples("V4")
V5 <- cluster_samples("V5")
V6 <- cluster_samples("V6")
# Chromosome directories to iterate over. list.files() replaces the original
# system(paste0("ls ", ...), intern = TRUE): same sorted directory names,
# no shell dependency.
chr_vec <- list.files(Pop_vcf_dir)
# Accumulators for the per-chromosome results of the loop below
PopGenome_results_list = list()
syn_PopGenome_results_list = list()
# use this line below for testing (without running a loop)
#chr <- chr_vec[1]
### Loop over chromosomes: record per-SNP coding/synonymous annotation and
### synonymous-site summary statistics per population.
for (chr in chr_vec) {
  # tryCatch aborts the current iteration when the gff check throws an error,
  # so chromosomes without annotated genes (empty gff file) are skipped.
  tryCatch({
    info <- file.info(paste0(Sumstats_dir, "PopGenome_splitchr/gff/", chr, "/", chr, ".gff"))[, "size"]
    if (info == 0) stop(paste("No genes on chr", chr, "- skipping!"))
    ### define paths and load data
    vcf.path <- paste0(Pop_vcf_dir, chr)
    gff.path <- paste0(Pop_gff_dir, chr)
    GENOME.class <- readData(vcf.path, format = "VCF", include.unknown = TRUE, gffpath = gff.path)
    all <- unlist(get.individuals(GENOME.class))
    # check how many SNPs were loaded using
    GENOME.class@n.biallelic.sites
    # Important: the output only refers to pop1, pop2, ... in the order given
    # below (pop1 = all samples, then clusters V1..V6). Record that order!
    GENOME.class <- set.populations(GENOME.class, list(all, V1, V2, V3, V4, V5, V6), diploid = FALSE)
    GENOME.class <- set.synnonsyn(GENOME.class, ref.chr = paste0(Pop_ref_dir, chr, ".fa"), save.codons = TRUE)
    # Pairwise Fst reshaped into a long data frame (Pop1/Pop2 columns).
    # FIX: the original first stored t(GENOME.class@nuc.F_ST.pairwise) in
    # pairwise.FST and immediately overwrote it; that dead assignment is gone.
    GENOME.class <- F_ST.stats(GENOME.class, mode = "nucleotide")
    pairwise.FST <- as.data.frame(GENOME.class@nuc.F_ST.pairwise) %>%
      mutate(pops = rownames(GENOME.class@nuc.F_ST.pairwise)) %>%
      separate(pops, into = c("Pop1", "Pop2"), sep = "/")
    # Per-SNP annotation: coding / synonymous / exonic flags at biallelic sites
    results = data.frame(CHROM = chr,
                         position = GENOME.class@region.data@biallelic.sites,
                         CodingSNPS = GENOME.class@region.data@CodingSNPS[[1]],
                         synonymous = GENOME.class@region.data@synonymous[[1]],
                         ExonSNPS = GENOME.class@region.data@ExonSNPS[[1]])
    colnames(results) = c("#CHROM", "POS", "CodingSNPS", "synonymous", "ExonSNPS")
    PopGenome_results_list[[chr]] = results
    # Summary statistics restricted to the synonymous positions
    GENOME.class.syn <- neutrality.stats(GENOME.class, subsites = "syn")
    GENOME.class.syn <- diversity.stats(GENOME.class.syn, subsites = "syn")
    data.Number_SNPs <- data.frame(GENOME.class.syn@n.segregating.sites, statistic = "Number_SNPs", chromosome = chr)
    data.TajD <- data.frame(GENOME.class.syn@Tajima.D,
                            statistic = "TajimaD", chromosome = chr)
    data.theta <- data.frame(GENOME.class.syn@theta_Watterson,
                             statistic = "theta_Watterson", chromosome = chr)
    data.Nucl_div <- data.frame(GENOME.class.syn@nuc.diversity.within,
                                statistic = "Nuc_Div", chromosome = chr)
    data.Nucl_div_per_site <- data.frame(GENOME.class.syn@nuc.diversity.within/GENOME.class.syn@n.sites,
                                         statistic = "Nuc_Div_per_site", chromosome = chr)
    data.full <- rbind(data.Number_SNPs, data.theta,
                       data.TajD, data.Nucl_div, data.Nucl_div_per_site)
    syn_PopGenome_results_list[[chr]] = data.full
    # used to abort loop
  }, error = function(e) {})
}
# Concatenate the per-chromosome site classifications and export the
# positions of synonymous SNPs (CHROM, POS; no header) for vcftools.
PopGenome_results_whole = bind_rows(PopGenome_results_list)
PopGenome_results_whole %>%
  filter(synonymous == 1) %>%
  dplyr::select(`#CHROM`, POS) %>%
  write_tsv(paste0(Sumstats_dir, "Synonymous_SNPs.list.txt"), col_names = F)
# Long-format table of the per-chromosome synonymous-site statistics.
# funs() is deprecated in dplyr; mutate_at() accepts the bare function
# (works on old and new dplyr alike).
syn_PopGenome_results_per_chr = bind_rows(syn_PopGenome_results_list) %>%
  na_if(., "NaN") %>%
  dplyr::mutate_at(vars(starts_with("pop")), as.numeric) %>%
  pivot_longer(-c(statistic, chromosome), names_to = "Population", values_to = "Estimate")
# Heatmap of per-chromosome Tajima's D (synonymous sites) by population.
p1 = syn_PopGenome_results_per_chr %>%
  filter(statistic == "TajimaD") %>%
  ggplot(aes(x = Population, y = as.numeric(chromosome), fill = Estimate)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(x = "Populations", y = "Chromosome",
       title = "Tajima's D in synonymous positions",
       subtitle = str_wrap(paste(""), width = 70),
       fill = "Tajima's D") +
  theme_light()
# Heatmap of per-chromosome nucleotide diversity per site (synonymous sites).
p2 = syn_PopGenome_results_per_chr %>%
  filter(statistic == "Nuc_Div_per_site") %>%
  ggplot(aes(x = Population, y = as.numeric(chromosome), fill = Estimate)) +
  geom_tile() +
  scale_fill_viridis_c(option = "magma") +
  labs(x = "Populations", y = "Chromosome",
       title = str_wrap(paste("Nucleotide diversity per site ",
                              "in synonymous positions"), width = 40),
       subtitle = str_wrap(paste(""), width = 70),
       fill = "Nucleotide diversity") +
  theme_light()
# Reset the accumulator for the per-gene run
PopGenome_results_list = list()
### Loop over the chromosomes again, this time computing statistics per gene.
for (chr in chr_vec) {
  # tryCatch aborts the current iteration if the gff check throws an error,
  # skipping chromosomes without annotated genes (empty gff file).
  tryCatch({
    # BUG FIX: this check previously built the gff path from project_dir, but
    # the split gff files live under Sumstats_dir (see the mkdir/splitting
    # chunk and the first PopGenome loop). file.info() on the wrong path
    # returns NA, `if (NA)` errors, and tryCatch silently skipped every
    # chromosome.
    info <- file.info(paste0(Sumstats_dir, "PopGenome_splitchr/gff/", chr, "/", chr, ".gff"))[, "size"]
    if (info == 0) stop(paste("No genes on chr", chr, "- skipping!"))
    ### define paths and load data
    vcf.path <- paste0(Pop_vcf_dir, chr)
    gff.path <- paste0(Pop_gff_dir, chr)
    GENOME.class <- readData(vcf.path, format = "VCF", include.unknown = TRUE, gffpath = gff.path)
    all <- unlist(get.individuals(GENOME.class))
    # check how many SNPs were loaded using
    GENOME.class@n.biallelic.sites
    # Important: the output only refers to pop1, pop2, ... in the order given
    # below (pop1 = all samples, then clusters V1..V6).
    GENOME.class <- set.populations(GENOME.class, list(all, V1, V2, V3, V4, V5, V6), diploid = FALSE)
    GENOME.class <- set.synnonsyn(GENOME.class, ref.chr = paste0(Pop_ref_dir, chr, ".fa"), save.codons = TRUE)
    # check assignment of populations using GENOME.class@populations (optional)
    ### split GENOME.class into genes (the "gene" feature must be in the GFF!)
    GENOME.class.split <- splitting.data(GENOME.class, subsites = "gene")
    ### calculate summary stats per gene
    GENOME.class.split <- neutrality.stats(GENOME.class.split)
    GENOME.class.split <- diversity.stats(GENOME.class.split)
    ### Build dataframes with summary stats per gene and population
    data.Number_SNPs <- data.frame(GENOME.class.split@n.segregating.sites, statistic = "Number_SNPs", chromosome = chr, position = GENOME.class.split@region.names)
    data.TajD <- data.frame(GENOME.class.split@Tajima.D, statistic = "TajimaD", chromosome = chr, position = GENOME.class.split@region.names)
    data.Nucl_div <- data.frame(GENOME.class.split@nuc.diversity.within, statistic = "Nuc_Div", chromosome = chr, position = GENOME.class.split@region.names)
    data.Nucl_div_per_site <- data.frame(GENOME.class.split@nuc.diversity.within/GENOME.class.split@n.sites, statistic = "Nuc_Div_per_site", chromosome = chr, position = GENOME.class.split@region.names)
    # syn / non-syn counts per gene (0 = non-synonymous change, 1 = synonymous)
    syn.count <- sapply(GENOME.class.split@region.data@synonymous, function(x) {sum(x == 1, na.rm = T)})
    nonsyn.count <- sapply(GENOME.class.split@region.data@synonymous, function(x) {sum(x == 0, na.rm = T)})
    data.Number_synSNPs <- data.frame(pop.1 = syn.count, pop.2 = "NA", pop.3 = "NA", pop.4 = "NA",
                                      pop.5 = "NA", pop.6 = "NA", pop.7 = "NA",
                                      statistic = "Number_synSNPs", chromosome = chr,
                                      position = GENOME.class.split@region.names)
    data.Number_nonsynSNPs <- data.frame(pop.1 = nonsyn.count, pop.2 = "NA", pop.3 = "NA", pop.4 = "NA",
                                         pop.5 = "NA", pop.6 = "NA", pop.7 = "NA",
                                         statistic = "Number_nonsynSNPs", chromosome = chr,
                                         position = GENOME.class.split@region.names)
    data.full <- rbind(data.Number_SNPs, data.Number_synSNPs, data.Number_nonsynSNPs,
                       data.TajD, data.Nucl_div, data.Nucl_div_per_site)
    PopGenome_results_list[[chr]] = data.full
    # used to abort loop
  }, error = function(e) {})
}
# Combine the per-gene statistics from every chromosome and drop the
# intermediate list to free memory.
PopGenome_results_per_gene = bind_rows(PopGenome_results_list)
rm(PopGenome_results_list)
# Tajima's D
# Per-gene Tajima's D in long format; funs() is deprecated in dplyr, so the
# conversion function is passed to mutate_at() directly.
temp = PopGenome_results_per_gene %>%
  dplyr::filter(statistic == "TajimaD") %>%
  na_if(., "NaN") %>%
  dplyr::mutate_at(vars(starts_with("pop")), as.numeric) %>%
  pivot_longer(-c(statistic, chromosome, position), names_to = "Population", values_to = "Estimate") %>%
  separate(position, into = c("start", "stop"))
# Per-population median, used as the violin fill color
temp_sum = temp %>% group_by(Population) %>%
  dplyr::summarize(Median_value = median(Estimate, na.rm = T))
temp = full_join(temp, temp_sum)
p3 = ggplot(temp, aes(x = Population, y = as.numeric(Estimate), fill = Median_value)) +
  geom_violin() +
  geom_boxplot(width = 0.1,
               outlier.shape = NA, color = "white") +
  labs(x = "", y = "Tajima's D",
       title = "Tajima's D per population in genes",
       subtitle = str_wrap(paste(""), width = 70),
       fill = "Median per pop") +
  theme_light() +
  scale_fill_viridis_c()
# Nucleotide diversity
# Per-gene nucleotide diversity per site; funs() is deprecated in dplyr,
# so the conversion function is passed to mutate_at() directly.
temp = PopGenome_results_per_gene %>%
  dplyr::filter(statistic == "Nuc_Div_per_site") %>%
  na_if(., "NaN") %>%
  dplyr::mutate_at(vars(starts_with("pop")), as.numeric) %>%
  pivot_longer(-c(statistic, chromosome, position), names_to = "Population", values_to = "Estimate") %>%
  separate(position, into = c("start", "stop"))
# Per-population median, used as the violin fill color
temp_sum = temp %>% group_by(Population) %>%
  dplyr::summarize(Median_value = median(Estimate, na.rm = T))
temp = full_join(temp, temp_sum)
# Diversity is shown on a log scale: values span orders of magnitude
p4 = ggplot(temp, aes(x = Population, y = log(as.numeric(Estimate)), fill = Median_value)) +
  geom_violin() +
  labs(x = "", y = str_wrap(paste("Nucleotide diversity per site",
                                  "in log scale"), width = 30),
       title = str_wrap(paste("Nucleotide diversity per site ",
                              "per population in genes"), width = 50),
       subtitle = str_wrap(paste(""), width = 70),
       fill = "Median per pop") +
  theme_light() +
  scale_fill_viridis_c(option = "magma") +
  geom_boxplot(width = 0.1,
               outlier.shape = NA, color = "white")
# 2x2 panel: chromosome heatmaps (A, B) and per-gene distributions (C, D)
plot_grid(p1, p2, p3, p4, labels = c('A', 'B', 'C', 'D'), label_size = 12, ncol = 2)
#Filter vcf file for only synonymous SNPs
# Keep only sites listed in Synonymous_SNPs.list.txt that pass all filters,
# restricted to the Z. tritici sample list ($ZTLIST.txt), biallelic, with no
# missing genotypes (--max-missing 1.0) and at least one non-reference allele.
# Output: ${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn.recode.vcf
vcftools \
--vcf ${VCFDIR}${VCFNAME_NOMAF}.recode.vcf \
--positions ${SUMST}Synonymous_SNPs.list.txt \
--remove-filtered-all \
--keep $ZTLIST.txt \
--non-ref-ac-any 1 \
--max-missing 1.0 --min-alleles 2 --max-alleles 2 \
--recode \
--out ${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn
#Create one count file per population
# For each of the 6 snmf clusters: extract the sample IDs whose highest
# ancestry coefficient is cluster V${x} (column 2 of the tsv), then run
# `vcftools --counts` on the synonymous VCF restricted to those samples.
# Produces one .frq.count file per population, read back into R below.
for x in {1..6} ;
do
cut -f 1,2 ${POPSTR}${VCFNAME}.pass_noNA.high_anc_coef_snmf.tsv \
| grep -w "V${x}" | cut -f 1 \
> ${SUMST}${VCFNAME}.pass_noNA.high_anc_coef_snmf.pop${x}.ind
vcftools \
--vcf ${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn.recode.vcf \
--keep ${SUMST}${VCFNAME}.pass_noNA.high_anc_coef_snmf.pop${x}.ind \
--remove-filtered-all \
--counts \
--out ${SUMST}${VCFNAME_NOMAF}.pass_noNA.high_anc_coef_snmf.pop${x}
done
#TODO: Filter vcf file for monomorphic SNP in outgroup and use to create dadi output?
# Uses a locally-built vcftools (jydu fork). --max-indv 1 keeps a single
# individual; the resulting one-sample VCF is then flattened to a
# CHROM/POS/REF/genotype table with bcftools query for dadi input.
/Users/feurtey/Documents/Software/vcftools_jydu/src/cpp/vcftools \
--vcf ${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn.recode.vcf \
--recode-INFO-all --recode \
--out ${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn.for_dadi \
--max-indv 1
/Users/feurtey/Documents/Software/bcftools/bcftools query -f '%CHROM\t%POS\t%REF[\t%TGT]\n' \
${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn.for_dadi.recode.vcf > \
${SUMST}${VCFNAME_NOMAF}.pass_noNA.syn.for_dadi.tab
# Read the per-population allele-count files produced by `vcftools --counts`
# above and derive the minor allele count (MAC) of every SNP per population.
SFS_counts_list = vector("list", 6)  # preallocated, one slot per population
for (x in seq_len(6)) {
  count_file = paste0(Sumstats_dir, vcf_name_nomaf,
                      ".pass_noNA.high_anc_coef_snmf.pop",
                      x, ".frq.count")
  # Skip the header; columns 5 and 6 hold "ALLELE:COUNT" pairs which
  # separate() splits on the ":" (default non-alphanumeric separator).
  counts = read_tsv(count_file,
                    skip = 1, col_names = FALSE) %>%
    separate(X5, into = c("REF allele", "REF_count")) %>%
    separate(X6, into = c("ALT allele", "ALT_count")) %>%
    dplyr::mutate(MAC = pmin(as.numeric(REF_count), as.numeric(ALT_count))) %>%
    dplyr::mutate(Pop = x)
  SFS_counts_list[[x]] = counts
}
SFS_counts = bind_rows(SFS_counts_list)
# Folded site frequency spectrum per population (monomorphic sites removed).
SFS_counts %>%
  filter(MAC > 0) %>%
  dplyr::group_by(Pop, MAC) %>%
  dplyr::count(name = "count") %>%
  # Fixed: was aes(as.numeric(x = as.numeric(MAC)), ...) -- the `x =` was
  # accidentally an argument of as.numeric(), not of aes().
  ggplot(aes(x = as.numeric(MAC), y = count)) +
  geom_bar(stat = "identity") +
  labs(title = "Site frequency spectrum from the different populations") +
  theme_bw() +
  facet_wrap(Pop ~ ., scales = "free")
Previously, based on the study of TE and RIP in 9 Z.tritici genomes, a hypothesis was drawn. Figure from Lorrain et al. 2020: Repeat Induced Point (RIP) mutations in transposons of Zymoseptoria spp. Histograms of Composite RIP index (CRI) frequencies of transposons estimated using a 50bp sliding windows approach as follows: CRI =(TpA/ ApT) – (CpA + TpG/ ApC + GpT) for A) Z. passerinii, Z. ardabiliae, Z. brevis and Z. pseudotritici, and B) Iranian Z. tritici isolates and C) European Z. tritici isolates. Vertical dash lines exhibit the threshold (0) above which CRI values indicate a RIP signature.
The lower RIP in TEs of European samples, as compared to Iranian isolates, could indicate a loss of RIP in Z.tritici when it spread out of its area of origin. Here, I would like to investigate this possibility in the different populations.
The data plotted here are based on the following steps:
#Run detection of TE/RIP
# NOTE(review): the second `dir=` assignment overwrites the first, so only the
# MM_NZ_TAS bam directory is processed by this loop; the SRA_2019 line looks
# like a leftover from a previous run -- confirm which input set is intended.
dir=/legserv/NGS_data/Zymoseptoria/Aligned_reads_Nuc_Mito_genomes/SRA_2019/PE/Bam/ ;
dir=/legserv/NGS_data/Zymoseptoria/Aligned_reads_Nuc_Mito_genomes/MM_NZ_TAS/Bam/
# Derive sample names from the .bai index files, then submit one SLURM job per
# sample, sleeping 2 min between submissions to stagger the scheduler load.
list_name=$(ls ${dir}*bai | sed "s|${dir}||" | sed 's/.bam.bai//' )
for x in $list_name ; do
echo $x;
sbatch TE_GC_RIP_bowtie.sh $x ${dir} ;
sleep 2m
done
#Once everything is finished from above
#--------------------------------------
#Gather all results
# Pull the "Composite" (RIP composite index) lines out of every per-sample
# result file; the sed calls strip the directory prefix and the ".txt:" part
# so each output line becomes: <sample> <TE> <values...>.
grep "Compo" /data3/alice/WW_project/WW_TE_RIP/0_RIP_estimation/3_RIP_estimation/*txt | \
sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/||' | \
sed 's/.txt:Composite//' | sed 's/\./ /' \
> /userhome/alice/WW_project/WW_TE_RIP/Composite_index.txt
#Gathering with too many files
# When the glob exceeds the shell argument limit, rebuild the file
# incrementally (rm then append). NOTE(review): only Composite_index.txt is
# removed here -- GC_percent.txt below is appended to without a prior rm, so
# rerunning this block duplicates its GC lines; confirm that is intended.
rm /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt
grep "Compo" /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/*RIP_est.txt | \
sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/||' | \
sed 's/.txt:Composite//' | sed 's/\./\t/' \
>> /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt
grep "GC" /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/*RIP_est.txt | \
sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/||' | \
sed 's/.txt:GC//' | sed 's/\./\t/' \
>> /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/GC_percent.txt
# Per-TE gathering: loop over every consensus name in the reference fasta so
# each grep only matches the files of one TE (avoids the too-many-files issue).
REF_NAME=/data2/alice/WW_project/0_Data/Badet_BMC_Biology_2020_TE_consensus_sequences
ls_TE=$(grep ">" ${REF_NAME}.fasta | sed 's/>//')
for TE in $ls_TE;
do
grep "Compo" /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/*${TE}.txt | \
sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/3_RIP_estimation/||' | \
sed 's/.txt:Composite//' | sed 's/\./\t/' \
>> /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Composite_index.txt
done
#Get read number per sample
# For every sample: total mapped reads come from the genome alignment bam,
# TE-aligned reads from the TE-consensus alignment bam. Both counts are the
# first flagstat "mapped" line (the `grep -v with` drops the
# "mapped with itself..." line). Output is space-separated, read by
# read_delim(delim = " ") in the R chunk below.
echo "ID_file Te_aligned_reads Total_reads" > /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads.txt
for fq in /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/*fq.gz ;
do
name=$(echo $fq | sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/||' | sed 's/.fq.gz//' );
echo $name
total_reads=$(~/Software/samtools-1.10/samtools flagstat /data2/alice/WW_project/1_Variant_calling/0_Mappings/${name}.bam | grep "mapped" | grep "with" -v | cut -f1 -d " ");
aln_reads=$(~/Software/samtools-1.10/samtools flagstat /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/${name}.bam | grep "mapped" | grep "with" -v | cut -f1 -d " ");
#aln_reads=$(zcat /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/${name}.fq.gz | paste - - - - | wc -l | cut -f 1);
echo $name $aln_reads $total_reads >> /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads.txt;
done
#Get read number per sample per TE
# idxstats gives, per reference sequence (TE consensus): name, length,
# #mapped, #unmapped. The awk prepends the sample name so all samples can be
# concatenated into one tab-separated table (read by read_delim below).
# NOTE(review): temp.txt is written in the current working directory -- this
# loop is not safe to run concurrently from the same directory.
rm /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads_per_TE.txt
for bamF in /data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/*bam ;
do
echo $bamF ;
name=$(echo $bamF | sed 's|/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/2_Aln_TE_consensus/||' | sed 's/.bam//' ) ;
/home/alice/Software/samtools-1.10/samtools idxstats $bamF > temp.txt ;
awk -v var="$name" '{OFS="\t"} {print var, $1, $2, $3, $4}' temp.txt >> \
/data2/alice/WW_project/4_TE_RIP/0_RIP_estimation/Nb_reads_per_TE.txt ;
done
First, I mapped the reads on the TE consensus created by Ursula based on Thomas’s pangenome. I visualize the results here in terms of percentage of reads mapping on these consensus.
#Reading in the data
# Per-sample TE read counts (shell block above), with the percentage of reads
# mapping on TE consensus, joined to the sample metadata spreadsheet.
# Samples where the TE-aligned count is not strictly below the total are
# dropped as inconsistent.
TE_qty = read_delim(paste0(RIP_DIR, "Nb_reads.txt"), delim = " ") %>%
  dplyr::mutate(Percent_TE_Reads = Te_aligned_reads * 100 / Total_reads) %>%
  dplyr::filter(Total_reads > Te_aligned_reads) %>%
  left_join(readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                               sheet = 1, n_max = 1000)) %>%
  unite(Continent_Region, Country, ID_file, col = "for_display", remove = FALSE)
# World-wide mean percentage of TE-mapping reads, used as a reference line.
world_avg <- mean(as.numeric(TE_qty$Percent_TE_Reads), na.rm = TRUE)
#Building the basic violin plot per continent
# One violin per geographic region; the dashed line marks the world average
# (world_avg) and the grey dot each region's mean. Fill_Continent and
# Color_Continent are shared manual scales defined earlier in the document.
TE_prop = TE_qty %>%
filter(!is.na(Continent_Region)) %>%
ggplot(aes(x = Continent_Region, y = Percent_TE_Reads, fill = Continent_Region)) +
geom_hline(aes(yintercept = world_avg),
color = "gray30", size = 0.6, linetype = "dashed") +
geom_violin(alpha = .8) +
stat_summary(fun = mean, geom = "point", size = 2, color = "grey30") +
# theme_cowplot() below overrides most of theme_classic(); order preserved.
theme_classic() + Fill_Continent + Color_Continent +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = str_wrap("Percentage of reads", width = 30),
title = "Amount of reads mapping on TE consensus per continent",
subtitle = str_wrap(paste(""), width = 70))
#TE_prop
#One-way ANOVA with blocks
##Define linear model
# Percentage of TE reads modelled as the region effect plus the sequencing
# Collection as a blocking factor (batch effect, shown below to be strong).
model = lm(Percent_TE_Reads ~ Continent_Region + Collection ,
data=TE_qty)
summary(model) ### Will show overall p-value and r-squared
##
## Call:
## lm(formula = Percent_TE_Reads ~ Continent_Region + Collection,
## data = TE_qty)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3156 -1.2556 -0.1235 1.1336 8.2952
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.15354 0.91848 18.676 < 2e-16 ***
## Continent_RegionAsia -0.35553 1.54017 -0.231 0.817550
## Continent_RegionEurope 0.22300 0.57541 0.388 0.698543
## Continent_RegionMiddle East -1.89939 0.62658 -3.031 0.002579 **
## Continent_RegionNorth America 0.94382 0.59419 1.588 0.112917
## Continent_RegionOceania 2.54310 0.69750 3.646 0.000298 ***
## Continent_RegionSouth America 1.26063 0.76898 1.639 0.101859
## CollectionC2 -0.02222 0.86040 -0.026 0.979405
## CollectionC3 -0.67798 0.96048 -0.706 0.480638
## CollectionHartmann -7.54541 0.77820 -9.696 < 2e-16 ***
## CollectionJGI 3.83328 0.77146 4.969 9.68e-07 ***
## CollectionMM_NZ_TAS 3.49334 0.87795 3.979 8.10e-05 ***
## CollectionThird_batch_2018_BM_TM -0.77949 0.78368 -0.995 0.320459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.025 on 437 degrees of freedom
## (75 observations deleted due to missingness)
## Multiple R-squared: 0.8833, Adjusted R-squared: 0.8801
## F-statistic: 275.6 on 12 and 437 DF, p-value: < 2.2e-16
##Conduct analysis of variance
# Type II ANOVA (car::Anova): each main effect tested after the other one.
Anova(model,type = "II")
## Anova Table (Type II tests)
##
## Response: Percent_TE_Reads
## Sum Sq Df F value Pr(>F)
## Continent_Region 423.4 6 17.21 < 2.2e-16 ***
## Collection 8381.2 6 340.69 < 2.2e-16 ***
## Residuals 1791.7 437
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): this summary(model) duplicates the call above verbatim.
summary(model)
##
## Call:
## lm(formula = Percent_TE_Reads ~ Continent_Region + Collection,
## data = TE_qty)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3156 -1.2556 -0.1235 1.1336 8.2952
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.15354 0.91848 18.676 < 2e-16 ***
## Continent_RegionAsia -0.35553 1.54017 -0.231 0.817550
## Continent_RegionEurope 0.22300 0.57541 0.388 0.698543
## Continent_RegionMiddle East -1.89939 0.62658 -3.031 0.002579 **
## Continent_RegionNorth America 0.94382 0.59419 1.588 0.112917
## Continent_RegionOceania 2.54310 0.69750 3.646 0.000298 ***
## Continent_RegionSouth America 1.26063 0.76898 1.639 0.101859
## CollectionC2 -0.02222 0.86040 -0.026 0.979405
## CollectionC3 -0.67798 0.96048 -0.706 0.480638
## CollectionHartmann -7.54541 0.77820 -9.696 < 2e-16 ***
## CollectionJGI 3.83328 0.77146 4.969 9.68e-07 ***
## CollectionMM_NZ_TAS 3.49334 0.87795 3.979 8.10e-05 ***
## CollectionThird_batch_2018_BM_TM -0.77949 0.78368 -0.995 0.320459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.025 on 437 degrees of freedom
## (75 observations deleted due to missingness)
## Multiple R-squared: 0.8833, Adjusted R-squared: 0.8801
## F-statistic: 275.6 on 12 and 437 DF, p-value: < 2.2e-16
# Visual check that residuals are roughly symmetric / normally distributed.
hist(residuals(model), col="darkgray")
#Post-hoc analysis: mean separation tests
# Tukey-adjusted pairwise comparisons of region least-squares means (averaged
# over Collection), followed by a compact-letter display for plotting.
library(multcomp)
library(lsmeans)
marginal = lsmeans(model, ~ Continent_Region)
pairs(marginal, adjust="tukey")
## contrast estimate SE df t.ratio p.value
## Africa - Asia 0.356 1.540 437 0.231 1.0000
## Africa - Europe -0.223 0.575 437 -0.388 0.9997
## Africa - Middle East 1.899 0.627 437 3.031 0.0410
## Africa - North America -0.944 0.594 437 -1.588 0.6899
## Africa - Oceania -2.543 0.697 437 -3.646 0.0055
## Africa - South America -1.261 0.769 437 -1.639 0.6568
## Asia - Europe -0.579 1.467 437 -0.394 0.9997
## Asia - Middle East 1.544 1.497 437 1.031 0.9466
## Asia - North America -1.299 1.485 437 -0.875 0.9761
## Asia - Oceania -2.899 1.528 437 -1.897 0.4832
## Asia - South America -1.616 1.547 437 -1.044 0.9433
## Europe - Middle East 2.122 0.385 437 5.507 <.0001
## Europe - North America -0.721 0.330 437 -2.183 0.3071
## Europe - Oceania -2.320 0.477 437 -4.860 <.0001
## Europe - South America -1.038 0.612 437 -1.695 0.6196
## Middle East - North America -2.843 0.361 437 -7.870 <.0001
## Middle East - Oceania -4.442 0.484 437 -9.178 <.0001
## Middle East - South America -3.160 0.665 437 -4.751 0.0001
## North America - Oceania -1.599 0.440 437 -3.632 0.0058
## North America - South America -0.317 0.635 437 -0.499 0.9989
## Oceania - South America 1.282 0.732 437 1.751 0.5818
##
## Results are averaged over the levels of: Collection
## P value adjustment: tukey method for comparing a family of 7 estimates
# Compact letter display: regions sharing a letter are not significantly
# different at alpha = 0.05.
CLD = cld(marginal,
alpha = 0.05,
Letters = letters, ### Use lower-case letters for .group
adjust = "tukey") ### Tukey-adjusted p-values
CLD
## Continent_Region lsmean SE df lower.CL upper.CL .group
## Middle East 15.0 0.358 437 14.0 16.0 a
## Asia 16.6 1.465 437 12.6 20.5 abc
## Africa 16.9 0.568 437 15.4 18.4 b
## Europe 17.1 0.211 437 16.6 17.7 b
## North America 17.9 0.298 437 17.1 18.7 b
## South America 18.2 0.606 437 16.5 19.8 bc
## Oceania 19.5 0.403 437 18.4 20.5 c
##
## Results are averaged over the levels of: Collection
## Confidence level used: 0.95
## Conf-level adjustment: sidak method for 7 estimates
## P value adjustment: tukey method for comparing a family of 7 estimates
## significance level used: alpha = 0.05
# Strip the padding spaces cld() puts around the letter groups.
CLD$.group=gsub(" ", "", CLD$.group)
### Plot
# Annotate the violin plot with the Tukey letter groups at y = 34.
TE_prop +
geom_text(data = CLD, aes(x = Continent_Region, label = .group, y = 34), color = "black")
The statistics used here are a one-way ANOVA with block. Blocks are used in an analysis of variance or similar models in order to account for suspected variation from factors other than the treatments or main independent variables being investigated. Here I considered the collection as the confounding factor. It definitely has an effect and was thus accounted for in the statistics related to TE content and to RIP level.
I would like to go finer in the TE content analysis and look at the reads aligning on each consensus sequence.
# Per-sample, per-TE read counts produced by the `samtools idxstats` shell
# loop above; joined to the sample metadata spreadsheet.
reads_per_TE = read_delim(paste0(RIP_DIR, "Nb_reads_per_TE.txt"), delim = "\t",
col_names = c("ID_file", "TE", "Length",
"# mapped read-segments", "# unmapped read-segments")) %>%
separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F) %>%
# NOTE(review): this labels TEs *not* starting with "D" as
# "Class II (DNA transposons)". Under the Wicker 3-letter codes DNA
# transposons start with "D" and retrotransposons with "R" -- confirm the
# condition is not inverted (same pattern is used in the RIP chunk below).
dplyr::mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)")) %>%
left_join(readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
sheet = 1, n_max = 1000)) %>%
unite(Continent_Region, Country, ID_file, col = "for_display", remove = F)
# Reads_mapped_per_TE is, despite the name, the total of TE-mapped reads per
# *sample* (summed over all TEs); it normalizes each TE's count below.
temp = reads_per_TE %>% group_by(ID_file) %>%
dplyr::summarise(Reads_mapped_per_TE = sum(`# mapped read-segments`))
reads_per_TE = left_join(reads_per_TE, temp) %>%
dplyr::mutate(Normalized_nb_reads_mapped = `# mapped read-segments` / Reads_mapped_per_TE)
#reads_per_TE %>%
# mutate(ID_file = fct_reorder(ID_file, Continent_Region)) %>%
# ggplot(aes(x = ID_file, y = Normalized_nb_reads_mapped, fill = Superfamily)) +
# geom_bar(stat = "identity")
Let’s try to make a PCA based on the proportion of reads mapping on each TE. Is there a geographical clustering in the TE content?
# Wide matrix: one row per sample, one column per TE consensus holding the
# normalized proportion of reads mapping on that consensus.
TE_PCA_mat = reads_per_TE %>%
dplyr::select(ID_file, Continent_Region, for_display, TE, Normalized_nb_reads_mapped) %>%
spread(key = TE, value = as.numeric(Normalized_nb_reads_mapped))
# Numeric matrix of the TE columns (metadata columns 1-4 are skipped).
temp =as.matrix(sapply(TE_PCA_mat[,c(5:ncol(TE_PCA_mat))], as.numeric))
# Drop zero-variance columns: scale. = TRUE in prcomp() fails on constants.
temp = temp[,apply(temp, 2, var, na.rm=TRUE) != 0]
TE.pca = prcomp(temp, center = TRUE,scale. = TRUE)
#summary(TE.pca)
#library(ggfortify)
#autoplot(TE.pca, data = TE_PCA_mat, colour = 'Continent_Region', shape = T, label.size = 3)
# Interactive PC scatter plots; `text = for_display` is used by ggplotly
# tooltips (Country + sample ID).
p1 = cbind(TE_PCA_mat, as.data.frame(TE.pca$x)) %>%
ggplot(aes(x = PC1, y = PC2, col = Continent_Region, text = for_display)) +
geom_point(alpha = 0.6) + theme_bw() + Color_Continent +
labs(title = "PCA based on normalized reads mapping on each TE consensus")
ggplotly(p1)
p2 = cbind(TE_PCA_mat, as.data.frame(TE.pca$x)) %>%
ggplot(aes(x = PC2, y = PC3, col = Continent_Region, text = for_display)) +
geom_point(alpha = 0.6) + theme_bw() + Color_Continent
ggplotly(p2)
p3 = cbind(TE_PCA_mat, as.data.frame(TE.pca$x)) %>%
ggplot(aes(x = PC3, y = PC4, col = Continent_Region, text = for_display)) +
geom_point(alpha = 0.6) + theme_bw() + Color_Continent
ggplotly(p3)
Let’s now plot the same results but all 4 axis against all and not interactive.
#And now all against all but not interactive
# as.tibble() was deprecated in tibble 2.0.0; as_tibble() is the replacement.
temp = as_tibble(cbind(TE_PCA_mat, as.data.frame(TE.pca$x))) %>%
  dplyr::select(for_display, Continent_Region, PC1, PC2, PC3, PC4)
# Pairwise scatter matrix of the first four PCs, coloured by region.
p = ggpairs(temp, columns = c(3:6), ggplot2::aes(col=Continent_Region, fill = Continent_Region, alpha = 0.6),
            title = "PCA based on normalized reads mapping on each TE consensus",
            upper = list(continuous = "points", combo = "box_no_facet"))
# Apply the shared theme and colour/fill scales to every panel of the matrix.
for (i in seq_len(p$nrow)) {
  for (j in seq_len(p$ncol)) {
    p[i, j] <- p[i, j] + theme_bw() + Color_Continent + Fill_Continent
  }
}
p
It does look like there is clustering of samples according to geography. This is interesting as it shows that the types of TEs are different in different populations.
I now look at the repeat-induced point mutations in reads that map on the different TE consensus. We expect to see differences in the different geographical groups so I start by visualizing this.
# Per-sample, per-TE RIP composite index (gathered by the shell block above),
# joined to metadata and to the normalized read counts so that poorly-covered
# consensus can be filtered later.
RIP=read_tsv(paste0(RIP_DIR, "Composite_index.txt"),
col_names = c("ID_file", "TE", "Composite_median", "Composite_mean" )) %>%
separate(TE, into = c("Superfamily", "TE_id"), sep = "_", remove = F) %>%
# NOTE(review): same possibly-inverted Class I/II condition as in the
# reads_per_TE chunk -- DNA transposons normally start with "D".
mutate(Order = ifelse(!grepl('^D',TE), "Class II (DNA transposons)", "Class I (retrotransposons)"))%>%
left_join(readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
sheet = 1, n_max = 1000)) %>%
unite(Continent_Region, Country, ID_file, col = "for_display", remove = F) %>%
left_join(., reads_per_TE)
#DONE: merge with reads_per_TE to be able to filter the points that have too few reads mapped!
#Per continent
# Rows with TE == "RIP_est" appear to hold the per-sample overall estimate
# (from the *RIP_est.txt files) rather than a single consensus -- the
# continent-level plots below use only those rows.
world_RIP_avg <-
RIP %>%
filter(TE == "RIP_est") %>%
dplyr::summarize(avg = mean(as.numeric(Composite_median), na.rm = T)) %>%
pull(avg)
temp = RIP %>%
filter(TE == "RIP_est") %>%
filter(!is.na(Continent_Region)) %>%
group_by(Continent_Region) %>%
dplyr::mutate(region_avg = mean(as.numeric(Composite_median), na.rm = T))
# Lollipop-style plot: segments run from the world average to each region's
# average; jittered points are individual samples.
RIP_plot = ggplot(temp, aes(x = Continent_Region,
y = as.numeric(Composite_median),
color = Continent_Region)) +
coord_flip() +
geom_segment(aes(x = Continent_Region, xend = Continent_Region,
y = world_RIP_avg, yend = region_avg), size = 0.8) +
geom_jitter(size = 1.5, alpha = 0.2, width = 0.2) +
geom_hline(aes(yintercept = world_RIP_avg), color = "gray70", size = 0.6) +
stat_summary(fun = mean, geom = "point", size = 5) +
Color_Continent +
theme_cowplot() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 40, hjust = 1)) +
labs (x = "", y = "RIP composite index",
title = "RIP levels per continents",
subtitle = str_wrap(paste("The RIP levels in reads mapping on TE consensus",
"are high in the Middle-East",
"and low in North America in particular."), width = 70))
#Statistical test
#One-way ANOVA with blocks
##Define linear model
# Same design as for TE content above: region effect with the sequencing
# Collection as a blocking (batch) factor; response is the per-sample RIP
# composite index (TE == "RIP_est" rows in `temp`).
model = lm(as.numeric(Composite_median) ~ Continent_Region + Collection ,
data=temp)
summary(model) ### Will show overall p-value and r-squared
##
## Call:
## lm(formula = as.numeric(Composite_median) ~ Continent_Region +
## Collection, data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50684 -0.04543 0.00169 0.04413 0.38740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.551853 0.045539 34.077 < 2e-16 ***
## Continent_RegionAsia -0.094539 0.083608 -1.131 0.2588
## Continent_RegionEurope -0.159444 0.031231 -5.105 4.92e-07 ***
## Continent_RegionMiddle East 0.268819 0.033998 7.907 2.13e-14 ***
## Continent_RegionNorth America -0.378820 0.032239 -11.750 < 2e-16 ***
## Continent_RegionOceania -0.298857 0.037481 -7.974 1.33e-14 ***
## Continent_RegionSouth America -0.172791 0.041745 -4.139 4.17e-05 ***
## CollectionC2 0.010863 0.042068 0.258 0.7964
## CollectionC3 0.001281 0.048029 0.027 0.9787
## CollectionHartmann -0.237595 0.037030 -6.416 3.60e-10 ***
## CollectionJGI 0.111332 0.036620 3.040 0.0025 **
## CollectionMM_NZ_TAS 0.189711 0.042851 4.427 1.20e-05 ***
## CollectionThird_batch_2018_BM_TM 0.077286 0.037390 2.067 0.0393 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1099 on 442 degrees of freedom
## (107 observations deleted due to missingness)
## Multiple R-squared: 0.826, Adjusted R-squared: 0.8213
## F-statistic: 174.9 on 12 and 442 DF, p-value: < 2.2e-16
##Conduct analysis of variance
# Type II ANOVA (car::Anova): each main effect tested after the other one.
Anova(model,type = "II")
## Anova Table (Type II tests)
##
## Response: as.numeric(Composite_median)
## Sum Sq Df F value Pr(>F)
## Continent_Region 14.0300 6 193.52 < 2.2e-16 ***
## Collection 9.8296 6 135.59 < 2.2e-16 ***
## Residuals 5.3407 442
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): this summary(model) duplicates the call above verbatim.
summary(model)
##
## Call:
## lm(formula = as.numeric(Composite_median) ~ Continent_Region +
## Collection, data = temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50684 -0.04543 0.00169 0.04413 0.38740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.551853 0.045539 34.077 < 2e-16 ***
## Continent_RegionAsia -0.094539 0.083608 -1.131 0.2588
## Continent_RegionEurope -0.159444 0.031231 -5.105 4.92e-07 ***
## Continent_RegionMiddle East 0.268819 0.033998 7.907 2.13e-14 ***
## Continent_RegionNorth America -0.378820 0.032239 -11.750 < 2e-16 ***
## Continent_RegionOceania -0.298857 0.037481 -7.974 1.33e-14 ***
## Continent_RegionSouth America -0.172791 0.041745 -4.139 4.17e-05 ***
## CollectionC2 0.010863 0.042068 0.258 0.7964
## CollectionC3 0.001281 0.048029 0.027 0.9787
## CollectionHartmann -0.237595 0.037030 -6.416 3.60e-10 ***
## CollectionJGI 0.111332 0.036620 3.040 0.0025 **
## CollectionMM_NZ_TAS 0.189711 0.042851 4.427 1.20e-05 ***
## CollectionThird_batch_2018_BM_TM 0.077286 0.037390 2.067 0.0393 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1099 on 442 degrees of freedom
## (107 observations deleted due to missingness)
## Multiple R-squared: 0.826, Adjusted R-squared: 0.8213
## F-statistic: 174.9 on 12 and 442 DF, p-value: < 2.2e-16
# Visual check that residuals are roughly symmetric / normally distributed.
hist(residuals(model), col="darkgray")
#Post-hoc analysis: mean separation tests
# Tukey-adjusted pairwise comparisons of region least-squares means for the
# RIP model (averaged over Collection), plus a compact-letter display.
marginal = lsmeans(model, ~ Continent_Region)
pairs(marginal, adjust="tukey")
## contrast estimate SE df t.ratio p.value
## Africa - Asia 0.0945 0.0836 442 1.131 0.9184
## Africa - Europe 0.1594 0.0312 442 5.105 <.0001
## Africa - Middle East -0.2688 0.0340 442 -7.907 <.0001
## Africa - North America 0.3788 0.0322 442 11.751 <.0001
## Africa - Oceania 0.2989 0.0375 442 7.974 <.0001
## Africa - South America 0.1728 0.0417 442 4.139 0.0008
## Asia - Europe 0.0649 0.0796 442 0.815 0.9834
## Asia - Middle East -0.3634 0.0813 442 -4.472 0.0002
## Asia - North America 0.2843 0.0806 442 3.528 0.0084
## Asia - Oceania 0.2043 0.0828 442 2.468 0.1737
## Asia - South America 0.0783 0.0840 442 0.932 0.9673
## Europe - Middle East -0.4283 0.0209 442 -20.474 <.0001
## Europe - North America 0.2194 0.0179 442 12.240 <.0001
## Europe - Oceania 0.1394 0.0255 442 5.473 <.0001
## Europe - South America 0.0133 0.0332 442 0.402 0.9997
## Middle East - North America 0.6476 0.0196 442 33.023 <.0001
## Middle East - Oceania 0.5677 0.0259 442 21.898 <.0001
## Middle East - South America 0.4416 0.0361 442 12.234 <.0001
## North America - Oceania -0.0800 0.0235 442 -3.401 0.0129
## North America - South America -0.2060 0.0345 442 -5.976 <.0001
## Oceania - South America -0.1261 0.0394 442 -3.199 0.0247
##
## Results are averaged over the levels of: Collection
## Note: contrasts are still on the as.numeric scale
## P value adjustment: tukey method for comparing a family of 7 estimates
# Compact letter display: regions sharing a letter are not significantly
# different at alpha = 0.05.
CLD_RIP = cld(marginal,
alpha = 0.05,
Letters = letters, ### Use lower-case letters for .group
adjust = "tukey") ### Tukey-adjusted p-values
CLD_RIP
## Continent_Region lsmean SE df lower.CL upper.CL .group
## North America 1.19 0.0159 442 1.15 1.24 a
## Oceania 1.27 0.0213 442 1.22 1.33 b
## South America 1.40 0.0327 442 1.31 1.49 c
## Europe 1.41 0.0111 442 1.38 1.44 c
## Asia 1.48 0.0795 442 1.26 1.69 bcd
## Africa 1.57 0.0307 442 1.49 1.66 d
## Middle East 1.84 0.0192 442 1.79 1.89 e
##
## Results are averaged over the levels of: Collection
## Results are given on the as.numeric (not the response) scale.
## Confidence level used: 0.95
## Conf-level adjustment: sidak method for 7 estimates
## Note: contrasts are still on the as.numeric scale
## P value adjustment: tukey method for comparing a family of 7 estimates
## significance level used: alpha = 0.05
# Strip the padding spaces cld() puts around the letter groups.
CLD_RIP$.group=gsub(" ", "", CLD_RIP$.group)
# Place the letter annotations 10% above the highest observed value.
text_y = (max(as.numeric(temp$Composite_median), na.rm = T) + 0.1*max(as.numeric(temp$Composite_median), na.rm = T))
RIP_plot +
geom_text(data = CLD_RIP, aes(x = Continent_Region,
y = text_y,
label = .group), color = "black")
It is known that different TE groups, in particular the MITEs, which are particularly small are less RIPped than other types of TEs. I wanted to check whether we saw such a pattern and so I visualize here the RIP per superfamily of TEs and then as related to the size of the consensus.
#Per TE superfamily
# Boxplots of the RIP composite index per TE superfamily. Consensus with
# essentially no mapped reads (< 0.01% of a sample's TE reads) are excluded;
# the navy line at 0 marks the threshold above which a RIP signature is
# inferred. Outliers are hidden and the y axis clipped to [-1, 5].
RIP %>%
filter(Normalized_nb_reads_mapped > 0.0001) %>%
group_by(Superfamily)%>%
mutate(median_Superfamily=median(Composite_median, na.rm = T) )%>%
ggplot(aes(x = Superfamily,
y = as.numeric(Composite_median),
fill = median_Superfamily)) +
geom_boxplot(outlier.shape = NA) +
theme_bw() +
ylab("Median of composite index on TE reads") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 40, hjust = 1)) +
ylim(-1, 5) + geom_hline( yintercept = 0, color = "navy")
# Median RIP per superfamily x region x TE class, shown as grouped bars,
# one panel per TE class (retrotransposons, then DNA transposons).
temp = RIP %>%
filter(Normalized_nb_reads_mapped > 0.0001) %>%
group_by(Superfamily, Continent_Region, Order)%>%
dplyr::summarize(median_Superfamily=median(Composite_median, na.rm = T) )
temp %>%
filter(Order == "Class I (retrotransposons)") %>%
ggplot(aes(x = Superfamily,
y = median_Superfamily,
fill = Continent_Region)) +
geom_bar(stat = "identity", position=position_dodge()) +
Fill_Continent +
theme_bw() +
ylab("Median of composite index on TE reads") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))
temp %>%
filter(Order == "Class II (DNA transposons)") %>%
ggplot(aes(x = Superfamily,
y = median_Superfamily,
fill = Continent_Region)) +
geom_bar(stat = "identity", position=position_dodge()) +
Fill_Continent +
theme_bw() +
ylab("Median of composite index on TE reads") +
theme(axis.text.x = element_text(angle = 40, hjust = 1))
#As relating to TE size
#NB: in the following plot, the alpha parameter is set to make the TE without any reads (or near so) invisible
#This means that not all consensus are visible. In particular, some, annotated by Ursula as "verylowcopy" are not.
# Consensus lengths come from the fasta index (.fai) of the TE consensus file.
TE_consensus_faidx =read_tsv(paste0(TE_RIP_dir, "Badet_BMC_Biology_2020_TE_consensus_sequences.fasta.fai"),
                             col_names = c("TE", "length", "offset",
                                           "number of bases per line", "number of bytes per line"))
# Median RIP index per consensus against (log) consensus length; point
# transparency scales with how many reads map on the consensus.
p = inner_join(RIP, TE_consensus_faidx) %>%
  dplyr::group_by(Order, TE, length) %>%
  filter (TE != "RIP_est") %>%
  filter(!is.na(Normalized_nb_reads_mapped)) %>%
  # Fixed: `na.rm = T` used to be a third argument of summarize() (creating a
  # literal `na.rm` column) instead of being passed to median().
  dplyr::summarize(median_per_consensus = median(Composite_median, na.rm = TRUE),
                   read_mapped = median(Normalized_nb_reads_mapped, na.rm = TRUE)) %>%
  ggplot(aes(x = log(length),
             y = median_per_consensus,
             color = Order)) +
  geom_point(aes( text = TE,
                  alpha = log(read_mapped))) + theme_bw() +
  ylim(c(-2, 4)) + geom_hline(yintercept = 0) +
  labs(x = "TE length (log-scale)",
       y = "Median of RIP composite index",
       title = str_wrap(paste("Median of the RIP composite index for each TE consensus",
                              "against the length of the consensus sequence"), width = 60)) # + geom_smooth(span = 1.5, fill = NA, size =0.7)
ggplotly(p)
# Same plot restricted to SINE and MITE consensus (small, expectedly
# RIP-poor elements).
p = inner_join(RIP, TE_consensus_faidx) %>%
  filter (TE != "RIP_est") %>%
  dplyr::group_by(Order, TE, length) %>%
  # Fixed: `na.rm = T` used to be a third argument of summarize() instead of
  # being passed to median(). Unlike the previous chunk there is no
  # !is.na() pre-filter here, so without na.rm the medians could be NA and
  # those points silently dropped.
  dplyr::summarize(median_per_consensus = median(Composite_median, na.rm = TRUE),
                   read_mapped = median(Normalized_nb_reads_mapped, na.rm = TRUE)) %>%
  filter(str_detect(TE, pattern = "SINE") | str_detect(TE, pattern = "MITE") ) %>%
  ggplot(aes(x = log(length),
             y = median_per_consensus,
             color = Order, text = TE,
             alpha = log2(read_mapped))) +
  geom_point() + theme_bw() +
  ylim(c(-2, 4)) + geom_hline(yintercept = 0)
ggplotly(p)
Finally, I look at the relation between the amount of reads mapping on TE consensus and the level of RIP detected. I also investigate several possible bias.
# Join the TE read percentages with the per-sample RIP estimates
# (TE == "RIP_est" rows only), so both axes exist per sample.
TE_RIP = inner_join(TE_qty, RIP %>%
filter(TE == "RIP_est") )
# Replace missing sampling years so they form their own legend category.
TE_RIP$Date_Year[is.na(TE_RIP$Date_Year)] <- "Unknown"
# Per-region means, over-plotted as large points on the scatter below.
temp = TE_RIP %>%
group_by(Continent_Region) %>%
dplyr::summarize(TE_qty = mean(Percent_TE_Reads, na.rm = T),
Composite_median = mean(as.numeric(Composite_median), na.rm = T)) %>%
dplyr::mutate(for_display = Continent_Region)
#TE and RIP together
t = ggplot(TE_RIP, aes(as.numeric(Percent_TE_Reads),
as.numeric(Composite_median),
color = Continent_Region,
text = for_display))+
theme_cowplot() +
geom_point(alpha = 0.6) + Color_Continent +
labs(color = "Geographical group",
x = "Percentage of TE reads", y = "RIP composite median",
title = "Amount of transposable elements vs RIP level") +
geom_point(data = temp, aes(as.numeric(TE_qty),
as.numeric(Composite_median),
color = Continent_Region), size = 5)
# Rendered twice on purpose: interactive (plotly) then static (ggplot).
ggplotly(t)
t
# Check whether the TE-vs-RIP pattern is driven by sequencing batch: the
# same scatter coloured by Collection instead of region.
bias1 = TE_RIP %>% ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color =Collection, text = for_display)) +
theme_cowplot() +
geom_point(alpha = 0.8)
#bias2 = TE_RIP %>% ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color =Library_strategy, text = for_display)) +
# theme_cowplot() +
# geom_point(alpha = 0.8)
ggplotly(bias1)
# Publication-style two-panel figure: the RIP lollipop plot and the TE
# violin plot side by side, each annotated with its Tukey letter groups
# (CLD_RIP / CLD computed above); titles/subtitles stripped.
cowplot::plot_grid(RIP_plot +
labs (title = "", subtitle = "") +
geom_text(data = CLD_RIP, aes(x = Continent_Region,
y = text_y,
label = .group),
color = "black"),
TE_prop +
labs (title = "", subtitle = "") +
geom_text(data = CLD, aes(x = Continent_Region,
label = .group,
y = 34),
color = "black") +
theme(legend.position = "none") +
coord_flip())
# Repeat the TE-vs-RIP scatter excluding the Hartmann collection, which
# shows a strong batch effect in the plots above.
# NOTE(review): `subset` masks base::subset() from here on -- consider
# renaming if base::subset() is used later in the document.
subset = TE_RIP %>%
filter(Collection != "Hartmann")
# Per-region means, over-plotted as large points on the scatter below.
temp = subset %>%
group_by(Continent_Region) %>%
dplyr::summarize(TE_qty = mean(Percent_TE_Reads, na.rm = T),
Composite_median = mean(as.numeric(Composite_median), na.rm = T)) %>%
dplyr::mutate(for_display = Continent_Region)
t = ggplot(subset, aes(as.numeric(Percent_TE_Reads),
as.numeric(Composite_median),
color = Continent_Region,
text = for_display)) +
theme_cowplot() +
geom_point(alpha = 0.6) + Color_Continent +
labs(color = "Geographical group",
x = "Percentage of TE reads", y = "RIP composite median",
title = "Amount of transposable elements vs RIP level") +
geom_point(data = temp, aes(as.numeric(TE_qty),
as.numeric(Composite_median),
color = Continent_Region), size = 5)
ggplotly(t)
Besides geographical origin and collection, I also wanted to check time for subsets of the genomes. The Oceanian samples have often shown their own pattern, so I looked at them separately. Additionally, I am curious about the North American samples since Ursula saw a temporal pattern.
#Checking the Oceanian samples
# TE content vs RIP for Oceania only, coloured by sampling year (using the
# `blues` palette defined earlier) and shaped by country, to look for the
# temporal/geographical substructure seen in the SNP analyses.
TE_RIP %>%
filter(Continent_Region == "Oceania") %>%
ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color = as.character(Date_Year), shape = Country, text = for_display)) +
geom_point(alpha = 0.8) +
theme_cowplot() +
scale_color_manual(values = blues) +
labs(color = "Group",
x = "Percentage of TE reads", y = "RIP composite median",
title = "TE vs RIP level in Oceania",
subtitle = str_wrap(paste0("Oceanian samples have shown a clear ",
"temporal and geographical substructure ",
"based on SNPs. How about RIP/TE content?"),
width = 70))
# TE content vs RIP for US samples only, coloured by collection and shaped
# by sampling year, to check for the temporal pattern Ursula observed.
t = TE_RIP %>%
filter(Country == "United States") %>%
ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median),
color = Collection, shape = as.character(Date_Year))) +
geom_point(alpha = 0.8) +
theme_bw() +
labs(color = "Geographical group",
x = "Percentage of TE reads", y = "RIP composite median",
title = "TE vs RIP level in North America",
subtitle = str_wrap(paste0("Ursula's data shows a geographical and a ",
"temporal pattern in the TE content. ",
"Can this be recovered here?"),
width = 70))
t
The RIP index does seem consistent so far with what Cécile has found, with higher RIP in the Middle-East and African populations and lower in the rest (in particular North America and Oceania).
Careful, there is a strong difference between the data from the Hartmann dataset and the rest. Let’s investigate a potential GC bias in the sequencing.
#The coverage files here are produced by bedtools coverage with the hist option using the aligned bam files and a bed file describing 1kb windows.
# Example loop:
#for sample in STnnJGI_SRR4235066 STnnJGI_SRR4235068 STnnJGI_SRR4235067 STnnJGI_SRR4235068 ; do rsync -avP /legserv/NGS_data/Zymoseptoria/Aligned_reads_Nuc_Mito_genomes/SRA_2019/PE/Bam/${sample}.bam ./ ; ~/Software/bedtools coverage -a Zymoseptoria_tritici.MG2.dna.toplevel.mt+.fa.1kb.bed -b ${sample}.bam -hist > ${sample}.coverage.txt ; done
# (note: SRR4235068 appears twice in that example list)
#And gathered like this:
#for file in *.coverage.txt ; do sample=$(echo $file | sed 's/.coverage.txt//') ; echo $sample ; awk -v #var=$sample '{print var, $1, $2, $3, $4, $5, $6, $7}' $file ; done > Coverage_in_windows.txt
# Collapse each per-window coverage histogram to one mean depth per 1kb
# window: sum(depth * bases_at_that_depth) / total bases in the window.
depth = read_tsv(paste0(TE_RIP_dir, "Coverage_in_windows.txt"),
                 col_names = c("Sample", "Chr", "Start", "Stop",
                               "Depth", "Nb_bases_with_depth",
                               "Size", "Fraction_covered")) %>%
  mutate(Sum_bases = Depth * Nb_bases_with_depth) %>%
  group_by(Sample, Chr, Start, Stop) %>%
  summarize(Nb_bases = sum(Nb_bases_with_depth), Sum_depth = sum(Sum_bases)) %>%
  mutate(mean_depth = Sum_depth / Nb_bases)
# readr >= 1.4 deprecated `path =` in favour of `file =`.
write_tsv(depth, file = paste0(TE_RIP_dir, "Coverage_in_windows_summary.txt"))
# Hand-picked subset of per-window depth files: 10 random Oregon ("ORE")
# samples plus a fixed set of isolates from several regions and one
# GC-corrected sample.
suffix = ".depth_per_window.txt"
files <- list.files(paste0(VAR_dir,"1_Depth_per_window/"), pattern = paste0(suffix, "$"))
# (A dead `sample(files, size = 0)` call that was immediately overwritten has
# been removed.)
smaller_files = sample(files[grepl("ORE", files)], size = 10, replace = FALSE)
smaller_files = c(smaller_files,
                  paste0("ST90ORE_a12_3B_6.chr_1.corrected", suffix),
                  paste0("Ukraine_1995_ST95UR_F1c_2", suffix),
                  paste0("Chile_1995_STCH95_1G3a_06", suffix),
                  paste0("Indiana_1993_I24a_1", suffix),
                  paste0("Indiana_1993_I25a_1", suffix),
                  paste0("Indiana_1993_I33a_1", suffix),
                  paste0("Israel_1992_ISYB1b", suffix),
                  paste0("Israel_1992_ISZH2a", suffix))
# Read every file once and bind in a single pass, instead of growing a
# data.frame with bind_rows() inside the loop (which copies the accumulator
# on every iteration).
depth_per_locus = bind_rows(lapply(smaller_files, function(f) {
  file_name = paste0(VAR_dir, "1_Depth_per_window/", f)
  # Sample name = file name without directory and without the suffix.
  sample_name = str_replace(basename(file_name), pattern = suffix, replacement = "")
  read_tsv(file_name, col_names = c("Locus_name", "Depth")) %>%
    mutate(ID_file = sample_name)
}))
# Median depth per isolate, computed only on core-chromosome windows
# (chromosomes 1-13); bracketed locus names are dropped first.
median_depth_cor_per_ind = depth_per_locus %>%
  filter(!grepl("\\[", Locus_name)) %>%
  separate(col = Locus_name, into = c("ignore", "chr", "start", "end")) %>%
  filter(as.numeric(chr) <= 13) %>%
  group_by(ID_file) %>%
  dplyr::summarise(Median_core_depth = median(as.numeric(Depth), na.rm = TRUE))
# Attach the per-isolate medians and the sample metadata, normalize each
# window by the isolate's core median, and keep isolates sequenced deeper
# than 10x on the core genome.
depth_per_locus = depth_per_locus %>%
  left_join(median_depth_cor_per_ind) %>%
  left_join(readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
                               sheet = 1, n_max = 1000)) %>%
  mutate(Normalized_depth = Depth / Median_core_depth) %>%
  filter(Median_core_depth > 10)
#depth = read_tsv(paste0(TE_RIP_dir, "Coverage_in_windows_summary.txt"))
#depth per locus coming from the PAV from pangenome
#t = read_tsv(paste0(TE_RIP_dir, "Zymoseptoria_tritici.MG2.dna.toplevel.mt+.nucl_content")) %>%
# unite("#1_usercol", "2_usercol", "3_usercol", col = "Locus_name", remove = F) %>%
# ggplot(aes(`5_pct_gc`)) + geom_density()
# Per-window nucleotide/GC content on the pangenome windows (the column names
# "#1_usercol", "5_pct_gc", ... match `bedtools nuc` output -- verify). The
# first three columns (chrom/start/end) are fused into a "Locus_name" key
# (kept as separate columns too, remove = F) for joining with depth_per_locus.
Nucl_content = read_tsv(paste0(data_dir, "all_19_pangenome.windows.nuc_GC.tab")) %>%
unite("#1_usercol", "2_usercol", "3_usercol", col = "Locus_name", remove = F)
# Z-score a numeric vector, tolerating missing values: subtract the mean,
# then divide by the standard deviation (both computed with na.rm = TRUE).
scale_this <- function(x){
  centered <- x - mean(x, na.rm = TRUE)
  centered / sd(x, na.rm = TRUE)
}
# Join per-window depth with per-window GC content, keep only chromosomes
# 1-4 (check GC bias on core chromosomes only, to limit accessory-region
# bias in coverage), and add a z-scored depth column.
GC_coverage = depth_per_locus %>%
  inner_join(Nucl_content, by = "Locus_name") %>%
  filter(!grepl("\\[", Locus_name)) %>%
  separate(col = Locus_name, into = c("ignore", "chr", "start", "end")) %>%
  filter(as.numeric(chr) <= 4) %>%
  mutate(scaled_depth = scale_this(Depth))
# Estimate the GC bias of one sample as the slope of log(depth + 1) ~ GC
# fraction across its windows.
#
# i    -- sample identifier matched against the ID_file column.
# data -- table with ID_file, Depth and `5_pct_gc` columns; defaults to the
#         globally-defined GC_coverage so existing calls keep working, but
#         can be passed explicitly (which also makes the function testable).
# Returns the named slope coefficient of the fitted lm.
getAlpha <- function(i, data = GC_coverage){
  #print(i)
  per_sample <- data[data$ID_file %in% i, , drop = FALSE]
  per_sample$log_depth <- log(per_sample$Depth + 1)
  linearMod <- lm(log_depth ~ `5_pct_gc`, data = per_sample)
  # The coefficient name keeps its backticks because the term is non-syntactic.
  linearMod$coefficients["`5_pct_gc`"]
}
# Vectorize over the sample ID only, so a single `data` frame is reused for
# every element instead of being recycled element-wise.
v_getAlpha <- Vectorize(getAlpha, vectorize.args = "i")
# Per-sample GC-bias summary: mean of the (per-sample constant) core median
# depth, mean window depth, and the fitted log-depth ~ GC slope.
GC_bias = GC_coverage %>%
  dplyr::group_by(ID_file) %>%
  dplyr::summarise(Median_core_depth = mean(Median_core_depth, na.rm = TRUE),
                   Median_depth = mean(Depth, na.rm = TRUE)) %>%
  dplyr::mutate(GC_bias_slope = v_getAlpha(ID_file))
# Chromosome 1 windows binned by GC percent (1% bins): average depth and
# window count per (sample, bin). Windows with no Date_Year are labelled
# "corrected".
temp = GC_coverage %>%
  filter(chr == 1) %>%
  mutate(GC_percent = round(`5_pct_gc` * 100, 0),
         Date_Year = ifelse(is.na(Date_Year), "corrected", Date_Year)) %>%
  group_by(ID_file, GC_percent, Date_Year) %>%
  dplyr::summarize(Average_depth = mean(Depth, na.rm = TRUE),
                   Window_count = n())
# Top panel: per-sample depth profile across GC bins; the horizontal line at
# depth 3 marks the level below which TE-like windows are poorly covered.
p1 = temp %>%
ggplot(aes(x = GC_percent, y = Average_depth, color = ID_file, linetype = Date_Year)) +
geom_line() + theme_bw() + geom_hline(yintercept = 3) +
theme(legend.position = "top") +
xlim(c(20, 70)) +
labs(x = "GC percent of the window",
y = str_wrap("Average depth per window of a given GC percent", width = 20))
# Middle panel: window count per GC bin for one representative sample.
# NOTE(review): unique(temp$ID_file[[1]]) takes unique() of a single value;
# unique(temp$ID_file)[1] was probably intended (same result here).
p2 = temp %>%
dplyr::filter(ID_file == unique(temp$ID_file[[1]])) %>%
ggplot(aes(GC_percent, Window_count)) + geom_bar(stat = "identity") +
theme_bw() +
labs(x = "GC percent of the window",
y = str_wrap("Number of windows of a given GC percent", width = 20))
# Bottom panel: median GC of reads aligning on TEs (North America only, split
# by collection), used as a proxy for the GC content of TEs.
p3 = read_tsv(paste0(RIP_DIR, "GC_percent.txt"), col_names = c("ID_file", "TE", "GC_median", "GC_mean")) %>%
inner_join(readxl::read_excel(paste0(metadata_dir, "Zt_global_data_set_09April2020_DC_AFperso.xlsx"),
sheet = 1, n_max = 1000)) %>%
filter(Continent_Region == "North America") %>%
ggplot(aes(GC_median, fill = Collection, color = Collection)) +
geom_density(alpha = 0.4) +
theme_bw() +
labs(x = "Median GC content of the reads aligning on TEs") +
xlim(c(20, 70)) +
theme(legend.position = "bottom")
# Stack the three panels vertically.
plot_grid(p1, p2, p3, nrow = 3, ncol = 1, rel_heights = c(1, 0.5, 0.7))
I’ve created a new representation of the GC bias and the first test for the correction.
For the top plot, I have sorted the windows of chromosome 1 into GC percent bins (1% bins). For each bin and for each sample, I then measure the average depth for all windows of the corresponding GC percent, and this is what is plotted here. In the middle, you can see a histogram of the number of windows for each bin and, at the bottom, the median GC content of reads aligning on TEs, which I use as a proxy to estimate the GC content of TEs in general.
As we have seen before, there is a severe GC bias in the older genomes: in the windows with a GC content similar to TE reads (42 - 47), the coverage is often below 3 (black horizontal line). I know that you had tested the effect of depth on your method, Ursula, but this is quite an extreme difference between around 20 to 3… Perhaps this could be tested with a manually GC-biased sample.
I have tried the correction on one sample. You can see it in the top panel as a dashed purple line. It really does correct! Obviously the most extreme values don’t get corrected because 0 reads is 0 reads… But it’s not as bad as I expected honestly!
I also tried in silico sequencing with GC bias, but I could find only one software tool which claimed to include GC bias in simulated reads, and unfortunately it really did not go as extreme as we needed.
# Select a random subset of nine samples to illustrate the GC bias
temp = sample(unique(pull(GC_coverage, ID_file)), size = 9, replace = FALSE)
GC_coverage %>%
  filter(ID_file %in% temp) %>%
  ggplot(aes(`5_pct_gc`, scaled_depth)) +
  geom_hex(bins = 70) +
  scale_fill_continuous(type = "viridis") +
  theme_bw() + facet_wrap(~ID_file, scale = "free") +
  geom_smooth(method = "lm", se = FALSE,
              color = "#29AF7F", size= 0.7) +
  labs(x = "GC percent per window",
       y = "Depth per window",
       title = "Illustration of the GC bias",
       subtitle = "I use the slope of a linear model to infer the GC bias.")
#The following block is silenced.
#It was meant to check whether I am indeed getting the same results with the model and the plot.
# The code is stored as a character string (never evaluated); it can be
# re-enabled with eval(parse(text = silence)) if the sanity check is needed
# again. It fits the same lm used in getAlpha and overlays the fitted line
# (geom_abline) on the hex plot to confirm the slope matches geom_smooth's.
silence = 'i=temp[1]
temp = GC_coverage %>%
filter(ID_file == i) %>%
mutate(log_depth = scaled_depth)
linearMod <- lm(log_depth ~ `5_pct_gc`, data=temp) # build linear regression model on full data
p = GC_coverage %>%
filter(ID_file == i) %>%
ggplot(aes(`5_pct_gc`, scaled_depth)) +
geom_hex(bins = 70) +
scale_fill_continuous(type = "viridis") +
theme_bw() + facet_wrap(~ID_file, scale = "free") +
geom_smooth(method = "lm", se = FALSE, color = "#29AF7F")
p + geom_abline(intercept = linearMod$coefficients["(Intercept)"], slope = linearMod$coefficients["`5_pct_gc`"]) +
ylim(c(-6, 15))'
# Checking on a possible link between depth and GC bias
# A: distribution of per-isolate median depth on the core chromosomes.
p1 = median_depth_cor_per_ind %>%
  ggplot(aes(Median_core_depth)) +
  geom_density(fill = mycolorsCorrel[7], col = mycolorsCorrel[1]) +
  theme_bw() +
  labs(x = "Median depth on core chromosome")
# B: distribution of the estimated GC-bias slopes.
# (The original accidentally added a second, unstyled geom_density() layer
# underneath the filled one; a single layer is drawn here.)
p2 = ggplot(GC_bias, aes(GC_bias_slope)) +
  geom_density(fill = myColors[1], col = myColors[1], alpha = 0.8) +
  theme_bw() +
  labs(x = "GC bias")
top = cowplot::plot_grid(p1, p2, labels = c("A", "B"))
# C: per-isolate depth vs GC bias, coloured by collection.
p3 = inner_join(GC_bias, TE_RIP, by = "ID_file") %>%
  ggplot(aes(Median_core_depth, GC_bias_slope, text = for_display, color = Collection)) +
  geom_point() + theme_light() +
  labs(x = "Median depth on core chromosome",
       y = "GC bias")
cowplot::plot_grid(top, p3, ncol = 1, nrow = 2, labels = c("", "C"))
# And now let's investigate if there is a correlation between GC bias and the values of interest from before
# Long format: one row per (isolate, estimate type), so the RIP composite
# median and the TE-read percentage can both be plotted against the GC-bias
# slope in a single faceted figure.
temp = inner_join(GC_bias, TE_RIP, by = "ID_file") %>%
pivot_longer(cols = c(Composite_median, Percent_TE_Reads), names_to = "Confounded estimate", values_to = "Estimated value") %>%
ggplot(aes(`Estimated value`, GC_bias_slope, text = for_display, color = Collection)) +
geom_point() + theme_light() +
facet_wrap(vars(`Confounded estimate`), scales = "free") +
labs(x = "",
y = "GC bias (slope of the per window estimate)",
title = "Effect of GC bias on TE and RIP estimation")
ggplotly(temp)
# Baseline TE-vs-RIP scatter for all samples, interactive for hover inspection.
t = TE_RIP %>%
ggplot(aes(Percent_TE_Reads, as.numeric(Composite_median), color = Collection,
shape = as.character(Date_Year), text = for_display)) +
geom_point(alpha = 0.8) +
theme_bw()
ggplotly(t)
I tried to correct the GC bias by using deeptools correctGCBias on one sample from the Oregon population. I then reran the PCA for the TE content. Although the individual depth values did change, it did not change the position of the sample in the PCA.
I don’t know if the GC bias is the reason for the clear difference between the Hartmann dataset or if the two are consequence of something else. Either way, it does not seem reasonable to compare this dataset to the rest. It seems that, inside of this dataset, the observed differences match with what can be inferred from the rest of the samples.
# Per-TE-copy RIP estimates: one row per matched TE sequence per genome.
RIP_in_high_copies_TE = read_tsv(paste0(TE_RIP_dir, "All_genomes.all_TEs.GC.RIP.tab"),
                                 col_names = c("Sample", "Seq_ID", "GC", "Product_index", "Substrate_index", "Composite_index"))
# Split the sequence ID on ".", "-" or ":" and compute the midpoint of each
# copy. NOTE(review): the extracted "Sample" field shares its name with the
# column read from the file -- confirm they agree.
temp = RIP_in_high_copies_TE %>%
  separate(Seq_ID, into = c("Sample", "Chrom", "Start", "End"),
           sep = "\\.|-|:", remove = FALSE) %>%
  dplyr::mutate(Start = as.integer(Start),
                End = as.integer(End),
                Coord = (Start + End)/2)
#temp %>%
# filter(Chrom == "chr_4") %>%
# filter(Composite_index <= 2 & Composite_index >= - 2 )%>%
# ggplot(aes(Coord, Composite_index, col = Composite_index)) +
# geom_point() +
# facet_wrap (vars(Sample), ncol = 1) +
# theme_bw() +
# scale_color_viridis_c()
# Genome-wide heatmap: TE copies binned into 100 kb windows per chromosome.
# The zero-padded window index keeps the x axis in genomic order when the
# chromosome and index are pasted together; extreme composite values
# (|index| > 2) are excluded before averaging.
temp %>%
dplyr::mutate(Nb = str_pad(round((Coord / 100000), 0), 6, pad = "0")) %>%
filter(Composite_index >= -2 & Composite_index<= 2) %>%
#filter(Chrom == "chr_1") %>%
unite(Window, Chrom, Nb) %>%
group_by(Sample, Window) %>%
dplyr::summarize(Composite = mean(Composite_index, na.rm = T)) %>%
ggplot(aes(Window, Sample, fill = Composite)) +
geom_tile() +
theme_bw() +
scale_fill_viridis_c()
# Chromosome-level heatmap of the mean RIP composite index per sample,
# after trimming extreme values (|index| > 3).
temp %>%
  filter(between(Composite_index, -3, 3)) %>%
  mutate(Chr = as.integer(str_replace(Chrom, "chr_", ""))) %>%
  group_by(Sample, Chr) %>%
  dplyr::summarize(Composite = mean(Composite_index, na.rm = TRUE)) %>%
  ggplot(aes(Chr, Sample, fill = Composite)) +
  geom_tile() +
  theme_bw() +
  scale_fill_viridis_c()
Next step will be to check if dim2 is present where the RIP values suggest it is: intact copies in the Middle East and Africa and absence/degeneration in the rest. Here, I use de novo genome assemblies and try to identify copies of dim2. For this, I took a deRIPped sequence of dim2 in the reference and blasted it onto Zt10 to get the sequence of Zt10_dim2, since Zt10 is known to have an intact sequence (see Mareike’s paper). I then compare this sequence (blastn) with the de novo assemblies to pull out all the copies and identify the highest identity score.
#Here the Zt10_dim2_from_MgDNMT_deRIP.fa is the sequence of Zt10_6.417 which corresponds to the deRIPPed version of dim2 in the reference IPO323 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2940312/pdf/GEN186167.pdf
# For every pre-existing SPAdes assembly in dir_source: copy the scaffolds
# locally, rename the fragments, build a blast database, and append blastn
# hits of the deRIPped dim2 query (tagged with the sample name) to one
# shared output file.
dir_source=/legserv/NGS_data/Zymoseptoria/Data_files/Illumina_Denovo/Hartmann_2017_Denovo/
list_spades=$(ls $dir_source)
out_file=/data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/Results_blast_dim2.txt
# Header: sample name + the 12 standard blast outfmt-6 columns.
echo "sample qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" > $out_file
for x in $list_spades ;
do
# Derive the sample name from the assembly directory name (third dot-field
# after normalising ".R"/".S." separators).
sample=$(echo $x | sed 's/\.R/_R/' | sed 's/\.S\./_S\./' | cut -d '.' -f 3) ; echo $sample $x ;
rsync -avP \
${dir_source}${x}/scaffolds.fasta \
/data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/
python2 /userhome/alice/Scripts/Rename_fragments_in_fasta.py \
-i /data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/scaffolds.fasta \
-o /data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/${sample}.fasta \
--simple -f spades
~/Software/ncbi-blast-2.10.0+/bin/makeblastdb \
-dbtype nucl \
-in /data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/${sample}.fasta \
-out /data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/${sample}
~/Software/ncbi-blast-2.10.0+/bin/blastn \
-query /userhome/alice/WW_project/WW_TE_RIP/Zt10_dim2_from_MgDNMT_deRIP.fa \
-db /data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/${sample} \
-outfmt 6 | \
awk -v sample="${sample}" 'BEGIN {OFS = " "} {print sample, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12}' >> \
$out_file
done
# For each sample name (read from stdin -- NOTE(review): no input redirection
# is attached to this loop here; the caller must supply one, e.g.
# `./script < sample_list.txt`): regenerate fastq from the aligned bam,
# assemble de novo with SPAdes, and blast the deRIPped Zt10 dim2 query
# against the assembly, writing one result file per sample.
while read sample ;
do
out_file=/data2/alice/WW_project/4_TE_RIP/1_Ztdim2_detection/1_Blast_from_denovo_assemblies/Results_blast_dim2_${sample}.txt
#Software directories
BWAPATH=/userhome/alice/Software/bwa-0.7.17/
BEDTOOLS_PATH=/userhome/alice/Software/
SAMTOOLS_PATH=/userhome/alice/Software/samtools-1.10/
BOWTIE_PATH=/userhome/alice/Software/bowtie2-2.4.1-linux-x86_64/
SCRIPTS_PATH=/userhome/alice/Scripts/
REF_NAME=/userhome/alice/WW_project/WW_TE_RIP/Badet_BMC_Biology_2020_TE_consensus_sequences
SPADES_PATH=/userhome/alice/Software/SPAdes-3.14.1-Linux/bin/
BLAST_PATH=/userhome/alice/Software/ncbi-blast-2.10.0+/bin/
# Files directories
DATA_DIR=/data3/alice/WW_project/Data/
WWTERIP_DIR=/data3/alice/WW_project/WW_TE_RIP/
RIP_DIR=${WWTERIP_DIR}0_RIP_estimation/
RIP_raw0=${RIP_DIR}0_BAM_temp/
RIP_raw1=${RIP_DIR}1_Fastq_from_bam/
RIP_aln=${RIP_DIR}2_Aln_TE_consensus/
RIP_est=${RIP_DIR}3_RIP_estimation/
DIM2_DIR=${WWTERIP_DIR}1_Ztdim2_detection/1_Blast_from_denovo_assemblies/
DIM_denovo=${DIM2_DIR}0_Spades/
DIM_blast=${DIM2_DIR}1_Blast_dim2_deRIPped/
#Getting fastq reads: name-sort the bam so read mates are adjacent for bamtofastq
${SAMTOOLS_PATH}samtools sort \
-n \
-o ${DATA_DIR}${sample}.sorted.bam \
${DATA_DIR}${sample}.bam
rm ${DATA_DIR}${sample}.bam
${BEDTOOLS_PATH}bedtools bamtofastq \
-i ${DATA_DIR}${sample}.sorted.bam \
-fq ${DATA_DIR}${sample}_R1.fq \
-fq2 ${DATA_DIR}${sample}_R2.fq
#De novo assembly (-f so a missing previous run is not an error)
rm -rf ${DIM_denovo}${sample}
# BUGFIX: SPAdes must consume the fastq files bamtofastq just wrote
# (${sample}_R1.fq / ${sample}_R2.fq); the previous ${sample}_1.fq.gz /
# ${sample}_2.fq.gz names were never produced.
${SPADES_PATH}spades.py \
-o ${DIM_denovo}${sample} \
--careful \
-1 ${DATA_DIR}${sample}_R1.fq \
-2 ${DATA_DIR}${sample}_R2.fq
# Rename scaffolds with simple sample-based IDs.
# BUGFIX: a "/" was missing between the sample directory and scaffolds.fasta.
python2 ${SCRIPTS_PATH}Rename_fragments_in_fasta.py \
-i ${DIM_denovo}${sample}/scaffolds.fasta \
-o ${DIM_blast}${sample}.fasta \
--simple -f spades
${BLAST_PATH}makeblastdb \
-dbtype nucl \
-in ${DIM_blast}${sample}.fasta \
-out ${DIM_blast}${sample}
# Blast the deRIPped dim2 query; prepend the sample name to each outfmt-6 row.
${BLAST_PATH}blastn \
-query /userhome/alice/WW_project/WW_TE_RIP/Zt10_dim2_from_MgDNMT_deRIP.fa \
-db ${DIM_blast}${sample} \
-outfmt 6 | \
awk -v sample="${sample}" 'BEGIN {OFS = " "} {print sample, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12}' >> \
$out_file
done
Let’s look at the results as dot plots and compare the results from the dim2 blast to the RIP composite index. So far the Middle-Eastern samples seem quite similar to one another, whereas other regions contain way more variability such as Europe.
In order to identify the native dim2 copy in genomes, I look for several things:
# Concatenate the per-sample blast result files into one table.
# NOTE(review): the glob also matches the combined Results_blast_dim2.txt from
# the first script, whose header line would then be read as a data row -- verify.
system(paste0("cat ", DIM2_DIR, "Results_blast_dim2*txt > ", DIM2_DIR, "Overall_results_blast_dim2.txt"))
# Reference lengths (bp) of dim2 and its two flanking genes; matches shorter
# than 80% of the reference length are discarded downstream.
length_dim2 = 3846
length_flank1 = 3689
length_flank2 = 1222
threshold_length_dim2 = 0.8 * length_dim2
threshold_length_flank1 = 0.8 * length_flank1
threshold_length_flank2 = 0.8 * length_flank2
# 14 columns: sample + gene (the blast qseqid, i.e. dim2 / dim2_flank1 /
# dim2_flank2) + the 12 standard outfmt-6 fields. NOTE(review): the awk lines
# above emit 13 fields (sample + 12); confirm which script produced the
# concatenated file, otherwise the last column reads as NA.
dim2_blast_results_complete = read_delim(paste0(DIM2_DIR, "Overall_results_blast_dim2.txt"), delim = " ",
col_names = c("sample", "gene", "qseqid", "sseqid", "pident", "length",
"mismatch", "gapopen", "qstart", "qend",
"sstart", "send", "evalue", "bitscore"))
# The flank1 and flank2 extractions were identical except for the gene name
# and the length threshold, so the shared logic lives in one helper.
# For one flanking gene: compute each match's midpoint, keep matches longer
# than `min_length`, spread the contig name into a column named after the
# gene (e.g. dim2_flank1), and count hits per sample (nb_gene).
extract_flank_hits <- function(blast_results, flank_gene, min_length) {
  # "dim2_flank1" -> "midcoord_flank1", matching the original column names.
  midcoord_col <- paste0("midcoord_", str_replace(flank_gene, "dim2_", ""))
  blast_results %>%
    filter(gene == flank_gene) %>%
    dplyr::select(sample, gene, sseqid, length, sstart, send) %>%
    dplyr::mutate("{midcoord_col}" := (sstart + send)/2) %>%
    filter(length > min_length) %>%
    pivot_wider(names_from = gene, values_from = sseqid) %>%
    dplyr::select(-length, -sstart, -send) %>%
    group_by(sample) %>%
    dplyr::mutate(nb_gene = n())
}
flank1 = extract_flank_hits(dim2_blast_results_complete, "dim2_flank1", threshold_length_flank1)
flank2 = extract_flank_hits(dim2_blast_results_complete, "dim2_flank2", threshold_length_flank2)
#First let's extract some information such as finding the middle coordinates for the
# 3 genes of interest
# One row per dim2 blast match, annotated with the contig and midpoint of
# each flanking gene of the same sample (full joins keep samples missing a
# flank).
dim2_blast_results = dim2_blast_results_complete %>%
  dplyr::filter(gene == "dim2") %>%
  full_join(., flank1, by = "sample") %>%
  full_join(., flank2, by = "sample") %>%
  # Missing flanks get explicit placeholders. The midcoord_* columns are
  # numeric, so the fill value must be the numeric 0: replace_na() with the
  # string "0" fails under type-checked tidyr (and previously forced a
  # character column that had to be converted back with as.numeric()).
  dplyr::mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
                dim2_flank2 = replace_na(dim2_flank2, "Not_found"),
                midcoord_flank1 = replace_na(midcoord_flank1, 0),
                midcoord_flank2 = replace_na(midcoord_flank2, 0)) %>%
  # Midpoint of the dim2 match itself, plus the ordered flank midpoints.
  dplyr::mutate(ave_coord = (sstart + send)/2,
                min_fl = pmin(midcoord_flank1, midcoord_flank2),
                max_fl = pmax(midcoord_flank1, midcoord_flank2))
#Now, let's compare the blast results of dim2 with the flanking genes
# (contig name and distance)
# Classify each dim2 match by which flanking genes share its contig
# ("Both" / "None" / "Flank1" / "Flank2"; "What" is an unreachable fallback),
# then check whether the match sits at a plausible position: between the two
# flank midpoints when both are present, or within 10 kb of the single
# present flank. Nb_flanking_found_2cat collapses this to ">1" (native-like)
# vs "0" -- note a match on the right contig but too far away is also "0".
dim2_blast_results = dim2_blast_results %>%
dplyr::mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
"Both",
ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
"None",
ifelse(sseqid == dim2_flank1, "Flank1",
ifelse(sseqid == dim2_flank2, "Flank2", "What"))))) %>%
dplyr::mutate(distance = ifelse(is_same_contig == "None", "No_distance",
ifelse(is_same_contig == "Both",
ifelse(ave_coord >= min_fl &
ave_coord <= max_fl ,
"Distance_OK", "Not_between"),
ifelse(is_same_contig == "Flank1",
ifelse((abs(midcoord_flank1 - ave_coord) <= 10000),
"Distance_OK", "Too_far"),
ifelse((abs(midcoord_flank2 - ave_coord) <= 10000),
"Distance_OK", "Too_far"))))) %>%
mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0",
ifelse(distance == "Distance_OK", ">1", "0")))
#dplyr::select(sample, sseqid, length, dim2_flank1, dim2_flank2, is_same_contig) %>%
# Distribution of all blast match lengths against Zt10 dim2, with the
# candidate minimum-length cutoff drawn as a vertical grey line.
ggplot(dim2_blast_results, aes(length)) +
  geom_density(alpha = 0.7) +
  theme_bw() +
  geom_vline(aes(xintercept = threshold_length_dim2), color = "gray70", size = 0.6) +
  labs(x = "Length (bp)",
       y = "Density",
       title = "Length of all blast matches against Zt10dim2",
       subtitle = str_wrap(paste("There is a large range in the size of the matches.",
                                 " In order to ignore very short matches, I could set up a threshold",
                                 " visualized here as the grey line."),
                           width = 70))
However, the length also greatly varies for copies for which we were able to detect one or two of the flanking genes on the same contig.
# A: match-length distributions split by which flanking genes share the contig.
p1 = dim2_blast_results %>%
ggplot(aes(length, fill = is_same_contig, color = is_same_contig)) +
geom_density(alpha = 0.7) +
theme_bw() +
labs(x = "Length (bp)",
y = "Density",
title = str_wrap("Length of all blast matches against Zt10dim2", width = 30))
# B: counts of copies with 0 vs >=1 flanking gene on the same contig.
p3 = dim2_blast_results %>%
ggplot(aes(Nb_flanking_found_2cat, fill = is_same_contig)) +
geom_bar(alpha = 0.7) +
theme_bw() +
labs ( x = "Number of flanking genes on the same contig",
y = "Number of copies",
title = str_wrap("Numbers of copies found in the same contig as the flanking genes ", width = 30)) +
theme(legend.position = "none")
# Shared legend extracted from p1, placed under the combined figure.
legend_b <- get_legend(
p1 +
guides(color = guide_legend(nrow = 1)) +
theme(legend.position = "bottom")
)
# Panels A and B side by side, legends suppressed in the panels themselves.
prow <- plot_grid(
p1 + theme(legend.position="none"),
p3 + theme(legend.position="none"),
align = 'vh',
labels = c("A", "B"),
hjust = -1,
nrow = 1)
plot_grid(prow, legend_b, ncol = 1, rel_heights = c(1, .1))
Now, I will select only the copies which have at least one flanking gene found on the same contig. I consider these the “native” copy of the gene. I will then make some plots using only these original copies to investigate the geographical distribution of both length and identity with the dim2 copy of Zt10.
# Per-sample match counts over ALL blast hits (any contig).
temp = dim2_blast_results %>%
group_by(sample) %>%
dplyr::summarize(n_matches = n(),
n_long_matches = sum(length > 1000))
# Per-sample summary of the putative native copy: only matches on a contig
# carrying at least one flanking gene at a plausible distance (">1").
# pident_wm weights percent identity by match length; samples whose summed
# match length exceeds the dim2 length + 200 bp are dropped as ambiguous.
# NOTE(review): `RIP` is built earlier in the document; only its RIP_est
# rows are joined in here.
sum_dim2_blast = dim2_blast_results %>%
dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
group_by(sample) %>%
dplyr::summarize(length_sum = sum(length),
pident_wm = weighted.mean(pident, length),
n_matches_on_native_contigs = n()) %>%
filter(length_sum < length_dim2 + 200) %>%
inner_join(., temp) %>%
inner_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file"))
#Plots of identity vs length of the original copy
# Central scatter: native-copy length vs identity, coloured by region,
# shaped by the number of matches on the native contig.
scatter <- sum_dim2_blast %>%
ggplot(aes(length_sum, pident_wm,
color = Continent_Region,
shape = as.character(n_matches_on_native_contigs))) +
geom_point() +
theme_bw() + Color_Continent +
theme(legend.position = "none") +
labs( x = "Length of the recovered native copy",
y = "Identity with Zt10dim2")
# Marginal boxplots of length per region (Asia excluded), flipped so they
# align with the scatter's x axis.
hist_top = sum_dim2_blast %>%
filter(Continent_Region != "Asia") %>%
ggplot(aes(Continent_Region, length_sum,
fill = Continent_Region, color = Continent_Region)) +
geom_boxplot(alpha = 0.6, width = 0.4) +
theme_bw() +
Fill_Continent + Color_Continent+
theme(legend.position = "none",
axis.text.x = element_text(size = 7),
axis.text.y = element_text(size = 7)) +
coord_flip() +
labs( x = "", y = "")
# Marginal violins of identity per region (Asia excluded).
hist_right = sum_dim2_blast %>%
filter(Continent_Region != "Asia") %>%
ggplot(aes(Continent_Region, pident_wm, fill = Continent_Region)) +
geom_violin(alpha = 0.6) +
theme_bw() + Fill_Continent +
theme(legend.position = "none",
axis.text.x = element_text(size = 7, angle = 20, hjust = 1)) +
labs( x = "", y = "")
#Options for top right plot
# Legend extracted from the top marginal plot, used to fill the empty corner.
legend_b <- get_legend(
hist_top +
guides(color = guide_legend(nrow = 1)) +
theme(legend.position = "right")
)
# Blank placeholder panel (kept for layout experiments).
empty <- ggplot()+geom_point(aes(1,1), colour="white")+
theme(axis.ticks=element_blank(),
panel.background=element_blank(),
axis.text.x=element_blank(), axis.text.y=element_blank(),
axis.title.x=element_blank(), axis.title.y=element_blank())
#Gather all
cowplot::plot_grid(hist_top, legend_b, scatter, hist_right,
ncol=2, nrow=2,
rel_widths=c(1.2, 1), rel_heights=c(1, 1),
align = 'vh')
# Count of isolates per (collection, region), dot size = count, faceted by
# the number of matches found on the native contig.
sum_dim2_blast %>%
  filter(Continent_Region != "Asia") %>%
  dplyr::mutate(Nb_frag = as.character(n_matches_on_native_contigs)) %>%
  dplyr::count(Nb_frag, Collection, Continent_Region) %>%
  ggplot(aes(Collection, Continent_Region, color = Collection)) +
  geom_point(aes(size = n), alpha = 0.6) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 7, angle = 20, hjust = 1)) +
  labs( x = "", y = "") +
  Fill_Continent +
  facet_wrap(vars(Nb_frag))
Here a large proportion of the Middle-Eastern samples from the old Hartmann dataset have two matches on the “right” contigs: this is due to a deletion found in one of the alleles of dim2 (shown in Mareike’s new version of her dim2 manuscript).
Let’s investigate quickly how many long copies there are in each genome. I’m also interested in the native match as related to RIP and geography.
# B: RIP composite median vs identity of the native dim2 match.
p1 = sum_dim2_blast %>%
ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent_Region,
shape = Collection))+
geom_point() + Color_Continent +
theme_bw() + theme(legend.position = "none") +
labs(x = str_wrap(paste("RIP composite index",
" (median per isolate)"),
width = 20),
y = str_wrap("Identity of the native match",
width = 20))
# C: RIP composite median vs number of long (>1 kb) blast matches.
p2 = sum_dim2_blast %>%
ggplot(aes(as.numeric(Composite_median), n_long_matches, color = Continent_Region,
shape = Collection))+
geom_point() + Color_Continent +
theme_bw() + theme(legend.position = "none") +
labs(x = str_wrap(paste("RIP composite index",
" (median per isolate)"),
width = 20),
y = str_wrap("Number of long blast matches (above 1kb)",
width = 20))
# A: identity of the native match vs number of long matches.
p3 = sum_dim2_blast %>%
ggplot(aes(x = pident_wm, y = n_long_matches, color = Continent_Region,
shape = Collection)) +
geom_point() + Color_Continent + theme_bw()+
labs(x = str_wrap("Identity of the native match",
width = 20),
y = str_wrap("Number of long blast matchess (above 1kb)",
width = 20),
color = "Geographical group") +
theme(axis.title=element_text(size=10))
bottom_row <- cowplot::plot_grid(p1, p2, labels = c('B', 'C'), label_size = 12)
# Spearman correlations for the three pairwise comparisons plotted above.
cor.test(sum_dim2_blast$pident_wm, sum_dim2_blast$Composite_median, method="spearman")
##
## Spearman's rank correlation rho
##
## data: sum_dim2_blast$pident_wm and sum_dim2_blast$Composite_median
## S = 12298290, p-value = 1.104e-08
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2613409
cor.test(sum_dim2_blast$n_long_matches, sum_dim2_blast$Composite_median, method="spearman")
##
## Spearman's rank correlation rho
##
## data: sum_dim2_blast$n_long_matches and sum_dim2_blast$Composite_median
## S = 16448207, p-value = 0.7951
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.01208886
cor.test(sum_dim2_blast$pident_wm, sum_dim2_blast$n_long_matches, method="spearman")
##
## Spearman's rank correlation rho
##
## data: sum_dim2_blast$pident_wm and sum_dim2_blast$n_long_matches
## S = 20238861, p-value = 2.779e-06
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2155852
# Assemble the three panels: A on top, B and C below.
cowplot::plot_grid(p3, bottom_row, labels = c('A', ''), label_size = 12, ncol = 1)
# Same identity-vs-RIP scatter, split to see whether the Hartmann collection
# alone (A) behaves differently from all the other collections (B).
p1 = sum_dim2_blast %>%
filter(Collection == "Hartmann") %>%
ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent_Region))+
geom_point() + Color_Continent +
theme_bw() + theme(legend.position = "none") +
labs(x = str_wrap(paste("RIP composite index",
" (median per isolate)"),
width = 20),
y = str_wrap("Identity of the native match",
width = 20))
p2 = sum_dim2_blast %>%
filter(Collection != "Hartmann") %>%
ggplot(aes(as.numeric(Composite_median), pident_wm, color = Continent_Region))+
geom_point() + Color_Continent +
theme_bw() + theme(legend.position = "none") +
labs(x = str_wrap(paste("RIP composite index",
" (median per isolate)"),
width = 20),
y = str_wrap("Identity of the native match",
width = 20))
cowplot::plot_grid(p1, p2, labels = c('A', 'B'), label_size = 12)
#Statistical test
#One-way ANOVA with blocks
##Define linear model
# Native-copy identity explained by geographical region, with Collection as
# a blocking factor to absorb batch/dataset effects.
model = lm(as.numeric(pident_wm) ~ Continent_Region + Collection ,
data=sum_dim2_blast)
summary(model) ### Will show overall p-value and r-squared
##
## Call:
## lm(formula = as.numeric(pident_wm) ~ Continent_Region + Collection,
## data = sum_dim2_blast)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7087 -1.0411 -0.1404 0.7935 7.7716
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 93.7009 1.0036 93.361 < 2e-16 ***
## Continent_RegionAsia -0.4458 1.5901 -0.280 0.7794
## Continent_RegionEurope -3.7012 0.6334 -5.843 1.16e-08 ***
## Continent_RegionMiddle East 2.9703 0.6731 4.413 1.36e-05 ***
## Continent_RegionNorth America -5.3291 0.6444 -8.270 2.72e-15 ***
## Continent_RegionOceania -2.9616 0.7423 -3.990 8.04e-05 ***
## Continent_RegionSouth America -5.3007 0.8134 -6.516 2.47e-10 ***
## CollectionC2 0.8472 0.9656 0.877 0.3809
## CollectionC3 0.3463 1.0660 0.325 0.7455
## CollectionHartmann 0.2642 0.8528 0.310 0.7569
## CollectionJGI 0.2275 0.8383 0.271 0.7863
## CollectionMM_NZ_TAS -1.9998 0.9509 -2.103 0.0362 *
## CollectionThird_batch_2018_BM_TM 0.3948 0.8651 0.456 0.6484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.06 on 355 degrees of freedom
## (96 observations deleted due to missingness)
## Multiple R-squared: 0.6487, Adjusted R-squared: 0.6368
## F-statistic: 54.62 on 12 and 355 DF, p-value: < 2.2e-16
##Conduct analysis of variance
# car::Anova with Type II sums of squares (order-independent, appropriate
# here since the model has no interaction term).
Anova(model,type = "II")
## Anova Table (Type II tests)
##
## Response: as.numeric(pident_wm)
## Sum Sq Df F value Pr(>F)
## Continent_Region 2285.1 6 89.7677 < 2.2e-16 ***
## Collection 119.6 6 4.6983 0.0001284 ***
## Residuals 1506.2 355
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Repeated summary of the same model (identical output to the one above).
summary(model)
##
## Call:
## lm(formula = as.numeric(pident_wm) ~ Continent_Region + Collection,
## data = sum_dim2_blast)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7087 -1.0411 -0.1404 0.7935 7.7716
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 93.7009 1.0036 93.361 < 2e-16 ***
## Continent_RegionAsia -0.4458 1.5901 -0.280 0.7794
## Continent_RegionEurope -3.7012 0.6334 -5.843 1.16e-08 ***
## Continent_RegionMiddle East 2.9703 0.6731 4.413 1.36e-05 ***
## Continent_RegionNorth America -5.3291 0.6444 -8.270 2.72e-15 ***
## Continent_RegionOceania -2.9616 0.7423 -3.990 8.04e-05 ***
## Continent_RegionSouth America -5.3007 0.8134 -6.516 2.47e-10 ***
## CollectionC2 0.8472 0.9656 0.877 0.3809
## CollectionC3 0.3463 1.0660 0.325 0.7455
## CollectionHartmann 0.2642 0.8528 0.310 0.7569
## CollectionJGI 0.2275 0.8383 0.271 0.7863
## CollectionMM_NZ_TAS -1.9998 0.9509 -2.103 0.0362 *
## CollectionThird_batch_2018_BM_TM 0.3948 0.8651 0.456 0.6484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.06 on 355 degrees of freedom
## (96 observations deleted due to missingness)
## Multiple R-squared: 0.6487, Adjusted R-squared: 0.6368
## F-statistic: 54.62 on 12 and 355 DF, p-value: < 2.2e-16
# Quick visual check of residual normality.
hist(residuals(model), col="darkgray")
#Post-hoc analysis: mean separation tests
marginal = lsmeans(model, ~ Continent_Region)
pairs(marginal, adjust="tukey")
## contrast estimate SE df t.ratio p.value
## Africa - Asia 0.4458 1.590 355 0.280 1.0000
## Africa - Europe 3.7012 0.633 355 5.843 <.0001
## Africa - Middle East -2.9703 0.673 355 -4.413 0.0003
## Africa - North America 5.3291 0.644 355 8.270 <.0001
## Africa - Oceania 2.9616 0.742 355 3.990 0.0016
## Africa - South America 5.3007 0.813 355 6.516 <.0001
## Asia - Europe 3.2554 1.505 355 2.164 0.3181
## Asia - Middle East -3.4161 1.532 355 -2.230 0.2822
## Asia - North America 4.8833 1.522 355 3.208 0.0243
## Asia - Oceania 2.5158 1.565 355 1.607 0.6777
## Asia - South America 4.8549 1.576 355 3.081 0.0358
## Europe - Middle East -6.6715 0.413 355 -16.140 <.0001
## Europe - North America 1.6279 0.371 355 4.388 0.0003
## Europe - Oceania -0.7396 0.505 355 -1.464 0.7660
## Europe - South America 1.5995 0.637 355 2.511 0.1584
## Middle East - North America 8.2994 0.384 355 21.628 <.0001
## Middle East - Oceania 5.9319 0.492 355 12.063 <.0001
## Middle East - South America 8.2710 0.684 355 12.088 <.0001
## North America - Oceania -2.3675 0.465 355 -5.093 <.0001
## North America - South America -0.0284 0.658 355 -0.043 1.0000
## Oceania - South America 2.3391 0.753 355 3.105 0.0334
##
## Results are averaged over the levels of: Collection
## Note: contrasts are still on the as.numeric scale
## P value adjustment: tukey method for comparing a family of 7 estimates
CLD_dim = cld(marginal,
alpha = 0.05,
Letters = letters, ### Use lower-case letters for .group
adjust = "tukey") ### Tukey-adjusted p-values
CLD_dim
## Continent_Region lsmean SE df lower.CL upper.CL .group
## North America 88.4 0.334 355 87.5 89.3 a
## South America 88.4 0.626 355 86.7 90.1 ab
## Europe 90.0 0.239 355 89.4 90.7 bc
## Oceania 90.8 0.424 355 89.6 91.9 c
## Asia 93.3 1.500 355 89.2 97.3 cde
## Africa 93.7 0.622 355 92.0 95.4 d
## Middle East 96.7 0.378 355 95.7 97.7 e
##
## Results are averaged over the levels of: Collection
## Results are given on the as.numeric (not the response) scale.
## Confidence level used: 0.95
## Conf-level adjustment: sidak method for 7 estimates
## Note: contrasts are still on the as.numeric scale
## P value adjustment: tukey method for comparing a family of 7 estimates
## significance level used: alpha = 0.05
CLD_dim$.group=gsub(" ", "", CLD_dim$.group)
text_y = (max(as.numeric(temp$Composite_median), na.rm = T) + 0.1*max(as.numeric(temp$Composite_median), na.rm = T))
And then as violin plots (with jittered points) per continental region.
#Left panel: distribution of the identity of the native dim2 copy to
#the Zt10 reference, per continental region, with the compact letter
#display (CLD_dim, computed above) overlaid at y = 102.
p1 = sum_dim2_blast %>%
group_by(Continent_Region) %>%
dplyr::mutate(avg_per_cont = mean(pident_wm, na.rm = T)) %>%
ggplot(aes(Continent_Region, pident_wm,
fill = Continent_Region)) +
geom_violin(aes(fill = Continent_Region), alpha = 0.5) +
geom_jitter(aes(col = Continent_Region), size = 0.8, alpha = 0.8, width = 0.2, height = 0.1) +
Fill_Continent + Color_Continent +
theme_light() +
theme(
panel.border = element_blank(),
axis.ticks.x = element_blank(),
legend.position = "none",
axis.text.x = element_text(size = 10, angle = 40, hjust = 1)) +
coord_flip() +
labs(x = NULL,
y = "Identity of the native copy to Zt10dim2") +
geom_text(data = CLD_dim, aes(x = Continent_Region,
y = 102,
label = .group), color = "black")
#Right panel: number of dim2 copies (long blast matches) per region
p2 = ggplot(sum_dim2_blast, aes(Continent_Region, n_long_matches,
fill = Continent_Region)) +
geom_violin() + Fill_Continent +
theme_light() +
theme(panel.grid.major.y = element_blank(),
panel.border = element_blank(),
legend.position = "none",
axis.text.x = element_text(angle = 40, hjust = 1))+
labs(x = NULL,
y = "Number of dim2 copies") + coord_flip()
#Combine the two panels side by side
cowplot::plot_grid(p1, p2, rel_widths = c(1, 1))
# TODO
# Use beginning and end of dim2 gene (100bp) to get gene boundaries on assemblies.
# Update: I tried. But it really does not work better...
#Here is the code of the comparison in case I need to use it later still
#Selecting start and end coordinates which are found on a contig
# with at least one of the flanking genes.
#
# Helper: for one end of the dim2 gene ("dim2_start" or "dim2_end"),
# join the blast hits with the contigs carrying each flanking gene
# (flank1, flank2) and keep only the copies for which at least one
# flanking gene lands on the same contig as the dim2 hit.
# Factored out because the start and end pipelines were byte-identical.
annotate_dim2_flanks <- function(gene_name) {
dim2_blast_results_complete %>%
filter(gene == gene_name) %>%
full_join(., flank1) %>%
full_join(., flank2) %>%
# A missing flank hit becomes an explicit "Not_found" so that the
# contig comparisons below never propagate NA
mutate(dim2_flank1 = replace_na(dim2_flank1, "Not_found"),
dim2_flank2 = replace_na(dim2_flank2, "Not_found")) %>%
# Which flanking gene(s) share a contig with the dim2 hit?
# "What" is an unreachable fallback kept as a sanity flag.
mutate(is_same_contig = ifelse(sseqid == dim2_flank1 & sseqid == dim2_flank2,
"Both",
ifelse(sseqid != dim2_flank1 & sseqid != dim2_flank2,
"None",
ifelse(sseqid == dim2_flank1, "Flank1",
ifelse(sseqid == dim2_flank2, "Flank2", "What")))))%>%
# Two categories: no flanking gene found (0) vs at least one (>1)
mutate(Nb_flanking_found_2cat = ifelse(is_same_contig == "None", "0", ">1"))%>%
filter(Nb_flanking_found_2cat == ">1")
}
dim2_start = annotate_dim2_flanks("dim2_start")
dim2_end = annotate_dim2_flanks("dim2_end")
# Table from start/end
#full_join(dim2_start, dim2_end, by = "sample") %>%
# filter(complete.cases(.)) %>%
# select(sample, dim2_flank1.x, dim2_flank2.x,
# gene.x, sseqid.x, sstart.x, send.x, is_same_contig.x, Nb_flanking_found_2cat.x,
# gene.y, sseqid.y, sstart.y, send.y, is_same_contig.y, Nb_flanking_found_2cat.y) %>%
# left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
# group_by(is_same_contig.x, Continent_Region) %>%
# dplyr::summarise(n()) %>%
# pivot_wider(names_from = Continent_Region, values_from = `n()`)
# Table from the "simple" blast as comparison
#dim2_blast_results %>%
# dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
# left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
# filter(length > 2500) %>%
# dplyr::count(is_same_contig, Continent_Region) %>%
# pivot_wider(names_from = Continent_Region, values_from = n)
#Keep only copies with:
# - both flanking genes
# - one flanking gene but a length of 2500 at least
#Heatmap: for each continental region, the proportion of long dim2
#matches falling in each contig-sharing category (Both/Flank1/Flank2).
dim2_blast_results %>%
dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
filter(length > 2500) %>%
dplyr::count(is_same_contig, Continent_Region) %>%
dplyr::group_by(Continent_Region) %>%
#Total per region, used to convert counts into percentages
dplyr::mutate(Somme_per_collection = sum(n)) %>%
mutate(prop = n*100/Somme_per_collection) %>%
ggplot(aes(x = is_same_contig, y = Continent_Region, fill = prop)) +
geom_tile() +
theme_bw() +
scale_fill_viridis_c() +
labs (title = str_wrap(paste0("Proportion of long dim2 matches ",
"with at least one flanking gene"),
width = 70))
#Write the bed file to extract the sequences
#Columns: sample, contig, start, end (start/end swapped when the blast
#hit is on the reverse strand so that start < end, as BED requires).
#NOTE(review): write_delim defaults to a space delimiter, so despite
#the .bed extension this file is space-separated; the `while read`
#loop below splits on whitespace so it still parses -- confirm no
#tool reads this file as a strict (tab-delimited) BED.
dim2_blast_results %>%
dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
dplyr::filter(length > 2500) %>%
dplyr::mutate(start = ifelse(sstart > send, send, sstart),
end = ifelse(sstart > send, sstart, send)) %>%
dplyr::select(sample, sseqid, start, end) %>%
write_delim(paste0(TE_RIP_dir, "Coordinates_dim2_all_samples.bed"),
col_names = F)
#This command has to be run on the cluster:
#for each row, extract the matched region from the sample's de novo
#assembly and prefix the fasta header with the sample name.
#while read sample chr start end ; do
# echo -e "${chr}\t${start}\t${end}" > temp.bed ;
# ~/Software/bedtools getfasta \
# -fi /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/0_Spades/${sample}.fasta \
# -bed temp.bed | sed "s/>/>${sample}:/" >> \
# /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/Native_dim2_all_samples.fasta ;
#done < /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/1_Blast_dim2_deRIPped/Coordinates_dim2_all_samples.bed
#Run through the website because laziness
# mafft --thread 8 --threadtb 5 --threadit 0 --reorder --adjustdirection --auto input > output
For the following tree, I used the gene alignment from mafft (online) and created a neighbor-joining tree. The gene sequence from Z. passerinii was used as the outgroup sequence. Then, I attempt to represent both the geographical origin of the samples and the RIP level.
#Prep data to add to tree
#Rebuild the label used in the alignment (sample_contig_start-end) so
#the blast/RIP metadata can be matched to the tree tip labels.
temp2 = dim2_blast_results %>%
dplyr::filter(Nb_flanking_found_2cat == ">1") %>%
left_join(., RIP %>% filter(TE == "RIP_est"), by = c("sample" = "ID_file")) %>%
dplyr::filter(length > 2500) %>%
dplyr::mutate(start = ifelse(sstart > send, send, sstart),
end = ifelse(sstart > send, sstart, send)) %>%
unite(coord, start, end, sep = "-") %>%
#Dots in contig names were replaced by underscores in the fasta headers
dplyr::mutate(no_dot = stringr::str_replace(sseqid, "\\.", "_"))%>%
unite(contig2, sample, no_dot, coord, sep = "_", remove = F)
#Read tree in
#details from the mafft website about the tree
#Size = 237 sequences × 1214 sites
#Method = Neighbor-Joining
#Model = Jukes-Cantor
#Bootstrap resampling = 100
#Parse tip labels back into sample/contig parts and join the metadata;
#the "R_" prefix added by mafft --adjustdirection is stripped first.
tree = as_tibble(treeio::read.tree("/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/Essai_dim2_tree.nwk")) %>%
mutate(label_copy = label) %>%
separate(label_copy, into = c("nb", "contig"), extra = "merge", remove =F) %>%
dplyr::mutate(nb = as.integer(nb)) %>%
dplyr::mutate(label_OG = label) %>%
dplyr::mutate(contig2 = stringr::str_replace(contig, "R_", "")) %>%
full_join(., temp2, by = c("contig2")) %>%
dplyr::mutate(label_temp = ifelse(is.na(sample), ifelse(is.na(contig), label, contig), sample)) %>%
unite(col = "label_new", nb, label_temp, sep = "_")
#A second, unannotated copy of the tree kept for rooting below
tree2 = as_tibble(treeio::read.tree("/Users/feurtey/Documents/Postdoc_Bruce/Projects/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/Essai_dim2_tree.nwk"))
#Tip annotation table: RIP level is discretized into three classes
#(>= 1.5 High, <= 1 Low, in between Medium; missing values set to 0)
temp = tree %>%
dplyr::select(label, node, label_new, Continent_Region, Composite_median, Collection) %>%
mutate(Composite_median = ifelse(is.na(Composite_median), 0, Composite_median)) %>%
filter(!is.na(label)) %>%
mutate(RIP = ifelse(Composite_median >= 1.5, "High", ifelse(Composite_median <= 1, "Low", "Medium")))
#Find the outgroup!
#Zpa63 is the Z. passerinii sequence used to root the tree.
root_node = tree2 %>%
filter(grepl("Zpa63", label)) %>%
dplyr::select(node) %>%
pull()
rooted.tree <- ape::root(as.treedata(tree2), root_node)
#Attach the tip annotations (minus the label column, which ggtree
#uses as the join key via %<+%)
p <- ggtree(rooted.tree, layout = "rectangular") %<+% (temp %>% dplyr::select (-label))
#Tips colored by continental region
p2 <- p + geom_tippoint(aes(color = Continent_Region)) +
theme(legend.position = "right") +
Color_Continent
#Tips colored by RIP class; with alphabetical levels the colors map
#presumably as High = darkred, Low = yellow, Medium = orange
p3 <- p + geom_tippoint(aes(color = RIP), alpha = 0.5) +
theme(legend.position = "right") +
scale_color_manual(values = c("darkred", "yellow", "orange"))
#Tips colored by sampling collection
p4 <- p + geom_tippoint(aes(color = Collection), alpha = 0.5) +
theme(legend.position = "right")
cowplot::plot_grid(p2, p3)
#Same tree with a RIP-class heatmap strip next to the tips
df = temp %>% dplyr::select(RIP)
rownames(df) <- temp$label
gheatmap(p2, df, width=.2) +
scale_fill_manual(values = c("darkred", "yellow", "orange"), name = "RIP") +
labs(title = "Gene tree for the dim2 sequence",
y = "")
Based on these trees, there is some clustering according to geography. Additionally, it looks like the Middle-Eastern samples have either a high or low RIP level on their TE, whereas the level is rather in the middle of the range elsewhere…
If there is a relaxation of RIP, we could expect that TEs would not be the only elements impacted: gene duplications should also be more likely in RIP-relaxed genomes than in RIP-active ones.
#Pangenome table from Badet et al.; one row per isolate x orthogroup,
#N_genes = copy number of the orthogroup in that isolate.
Badet_pan_table = read_tsv(paste0(TE_RIP_dir, "Badet_GLOBAL_PANGENOME_TABLE.txt"))
#Per isolate: percentage of orthogroups present in 2-10 copies,
#stacked by copy number.
Badet_pan_table %>%
dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
group_by(isolate) %>%
#Total number of orthogroup entries per isolate (denominator)
dplyr::mutate(nb_genes = n()) %>%
group_by(isolate, N_genes) %>%
dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
dplyr::mutate(Percent = 100 * count / nb_genes) %>%
dplyr::filter(N_genes >= 2) %>%
dplyr::filter(N_genes <= 10) %>%
ggplot(aes(isolate, Percent, fill = N_genes)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 40, hjust = 1)) +
scale_fill_gradient(low = "#EAEAEA", high = "#294D4A", na.value = NA)
#Same idea but copy numbers >= 9 are lumped into a "9 +" category
Badet_pan_table %>%
dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
group_by(isolate) %>%
dplyr::mutate(nb_genes = n()) %>%
dplyr::filter(N_genes >= 2) %>%
dplyr::mutate(N_genes_cat = ifelse(N_genes >= 9, "9 +", as.character(N_genes))) %>%
group_by(isolate, N_genes_cat) %>%
dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
dplyr::mutate(Percent = 100 * count / nb_genes) %>%
ggplot(aes(isolate, Percent, fill = N_genes_cat)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 40, hjust = 1))
#Focus on the two-copy (simple duplication) category only
Badet_pan_table %>%
dplyr::select(isolate, orthogroup, N_genes, N_isolates, cat) %>%
group_by(isolate) %>%
dplyr::mutate(nb_genes = n()) %>%
dplyr::filter(N_genes >= 2) %>%
dplyr::mutate(N_genes_cat = ifelse(N_genes >= 9, "9 +", as.character(N_genes))) %>%
group_by(isolate, N_genes_cat) %>%
dplyr::summarize(count = n(), nb_genes = mean(nb_genes)) %>%
dplyr::mutate(Percent = 100 * count / nb_genes) %>%
dplyr::filter(N_genes_cat == 2) %>%
ggplot(aes(isolate, Percent, fill = N_genes_cat)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 40, hjust = 1))
#Join the dim2/RIP summary with the per-isolate count of duplicated
#genes; the < 400 filter drops a few extreme outliers -- TODO confirm
#this hard-coded cutoff.
temp = inner_join(sum_dim2_blast, duplicated_genes, by = c("sample" = "ID_file", "Continent_Region")) %>%
filter(Nb_duplicated_genes < 400)
#RIP composite index vs number of duplicated genes
p1 = ggplot(temp, aes(as.numeric(Composite_median),
Nb_duplicated_genes,
color = Continent_Region,
shape = Collection))+
geom_point() + Color_Continent +
theme_bw() +
labs(x = str_wrap(paste("RIP composite index",
" (median per isolate)"),
width = 20),
y = str_wrap("Number of duplicated genes",
width = 20))
#Identity of the native dim2 copy vs number of duplicated genes
p2 = ggplot(temp, aes(pident_wm,
Nb_duplicated_genes,
color = Continent_Region,
shape = Collection))+
geom_point() + Color_Continent +
theme_bw() + theme(legend.position = "none") +
labs(x = str_wrap(paste("Identity of the native match"),
width = 20),
y = str_wrap("Number of duplicated genes",
width = 20))
#Share one legend between the two stacked panels
legend_b <- get_legend(
p1 +
theme(legend.position = "right")
)
top = cowplot::plot_grid(p1+ theme(legend.position = "none"), p2, ncol = 1)
cowplot::plot_grid(top, legend_b, nrow = 1)
For the following points, different types of data need to be collected.
Genomic data: ideally, we would work from both SNPs and CNV of genes as much as possible (not including TE at the moment, because too complex/uncertain?) + Add the structural variants if/when we can detect them from Illumina data?
Geographic/environmental data: For this, I will use the sampling site of each isolate (as precisely as I can manage) to approximate environmental parameters such as temperature, precipitation or solar radiation. One possibility to find such data is this website, https://www.worldclim.org/data/worldclim21.html (published in Fick and Hijmans, 2017), which gives access to climate data for 1970-2018. These can be transformed into bioclimatic variables using the biovars function from the R package dismo (https://rdrr.io/cran/dismo/man/biovars.html).
# Distinct sampling coordinates of the isolates, as numeric X/Y
# (longitude/latitude), turned into a SpatialPoints object used
# below to extract values from the climate rasters.
temp = Zt_meta_for_shiny %>%
dplyr::transmute(X = as.numeric(unlist(Longitude)),
Y = as.numeric(unlist(Latitude))) %>%
distinct()
sp = SpatialPoints(temp[, c("X", "Y")])
summary(sp)
## Object of class SpatialPoints
## Coordinates:
## min max
## X -122.9300 175.6576
## Y -46.2187 59.3294
## Is projected: NA
## proj4string : [NA]
## Number of points: 97
#Bioclim data (from Worldclim)
#Human-readable names for the 19 WorldClim bioclimatic variables
#(BIO1-BIO19), in order; used below to name the extracted layers.
#(Fixed a stray closing parenthesis in the last label.)
Bioclim_var = c("Annual Mean Temperature", "Mean Diurnal Range ",
"Isothermality (BIO2/BIO7) (×100)", "Temperature Seasonality (standard deviation ×100)",
"Max Temperature of Warmest Month", "Min Temperature of Coldest Month",
"Temperature Annual Range (BIO5-BIO6)",
"Mean Temperature of Wettest Quarter","Mean Temperature of Driest Quarter",
"Mean Temperature of Warmest Quarter", "Mean Temperature of Coldest Quarter",
"Annual Precipitation", "Precipitation of Wettest Month",
"Precipitation of Driest Month", "Precipitation Seasonality (Coefficient of Variation)",
"Precipitation of Wettest Quarter", "Precipitation of Driest Quarter",
"Precipitation of Warmest Quarter","Precipitation of Coldest Quarter")
#Extract, for every sampling point in `sp`, the value of each of the
#first 12 bioclim rasters (only BIO1-BIO12 are used downstream).
bio_list = list()
for (i in seq_len(12)) {
file_name=paste0(data_dir,"Climatic_data/wc2.1_10m_bio/wc2.1_10m_bio_", i, ".tif")
rast_temp = raster(file_name)
bio_list[[Bioclim_var[i]]] = raster::extract(rast_temp, sp)
}
#Adding solar radiations
#Extract the 12 monthly solar radiation rasters at every sampling
#point, then keep only the yearly maximum per site.
sol_list = list()
for (i in seq_len(12)) {
#Months are zero-padded in the WorldClim file names (01..12)
file_name=paste0(data_dir,"Climatic_data/wc2.1_10m_srad/wc2.1_10m_srad_", str_pad(i, 2, pad ="0"), ".tif")
rast_temp = raster(file_name)
sol_list[[i]] = raster::extract(rast_temp, sp)
}
sol_months = cbind(temp, do.call(cbind, sol_list))
#Row-wise maximum over the 12 monthly columns (columns 3+; 1-2 are X/Y)
sol_months$srad_max = apply(sol_months[, 3:ncol(sol_months)], 1, max)
sol_months = sol_months %>% dplyr::select(X,Y, srad_max)
#Final per-coordinate climate table: 12 bioclim variables + srad_max
climate_per_coordinates = cbind(temp, do.call(cbind, bio_list)) %>%
full_join(., sol_months)
#Long-format table of every climate variable per sampling site,
#keeping the sampling collection for coloring. ID_file is joined then
#dropped, so each site appears once per isolate sampled there.
dat = Zt_meta_for_shiny %>% dplyr::select(ID_file, Latitude, Longitude, Sampling_collection) %>%
full_join(., climate_per_coordinates,
by= c("Longitude" = "X", "Latitude" = "Y")) %>%
dplyr::select(-ID_file) %>%
gather(key = "Bioclim_var", value = "Estimate", -c(Longitude, Latitude, Sampling_collection))
#Define stable colors
#One fixed color per sampling collection so the palette is stable
#across plots regardless of which collections are present.
myColors = c("#ec9a29", "#0f8b8d", "#143642")
names(myColors) = levels(factor(dat$Sampling_collection))
colScale = scale_colour_manual(name = "Sampling_collection", values = myColors)
colScaleFill = scale_fill_manual(name = "Sampling_collection", values = myColors)
#Distribution of each climate variable, stacked by collection
ggplot(dat, aes(Estimate, fill =Sampling_collection, color =Sampling_collection)) +
geom_histogram(position = "stack") +
facet_wrap(.~Bioclim_var, scales = "free") +
theme_classic() + colScale + colScaleFill
Naturally, many of the variables investigated above are highly correlated. It is intuitive for example that the minimum temperature of the coldest month would be correlated to the average temperature of the coldest quarter! Here, I visualize these correlations.
#Correlogram
#Pairwise Pearson correlations between all columns of the climate
#table. NOTE(review): this includes the X/Y coordinate columns in the
#matrix -- confirm that is intended. mycolorsCorrel is defined
#earlier in the notebook.
Ccor = cor(climate_per_coordinates)
corrplot(Ccor, type="lower", order="hclust",
col = mycolorsCorrel,
tl.col="black", tl.srt=45, tl.cex = 0.7)
#Simple heatmap
#Same matrix shown as a clustered symmetric heatmap
heatmap(x = Ccor, col = mycolorsCorrel, symm = TRUE)
How to choose the best variables?
#Continental region of each distinct sampling site, used to color the
#PCA. NOTE(review): grouping assumes country_climate rows are in the
#same order as climate_per_coordinates rows -- verify the two tables
#are row-aligned.
country_climate = Zt_meta_for_shiny %>% dplyr::select(Latitude, Longitude, Continent_Region) %>% unique()
#PCA of the scaled climate variables (first column = X is skipped)
bioclim.pca <- prcomp(climate_per_coordinates[,c(2:ncol(climate_per_coordinates))], scale. = TRUE)
ggbiplot(bioclim.pca, obs.scale = 1, var.scale = 1,
group = country_climate$Continent_Region, ellipse = TRUE) +
Color_Continent +
theme(legend.direction = 'horizontal', legend.position = 'top') +
xlim(-10, 20) + ylim(-5, 20)
# Pairwise correlations between the bioclimatic variables, quarterly
# variables and coordinates excluded, reshaped to one row per pair.
cor_pairs <- climate_per_coordinates %>%
dplyr::select(-contains("uarter"), -Y, -X) %>%
correlate() %>%
stretch()
# Keep only the moderately-to-strongly correlated pairs (|r| > 0.3)
# and turn them into an undirected graph.
strong_pairs <- filter(cor_pairs, abs(r) > 0.3)
cor_network <- graph_from_data_frame(strong_pairs, directed = FALSE)
# Network view: edge width = |r|, edge color = sign/strength of r.
ggraph(cor_network) +
geom_edge_link(aes(edge_alpha = .5, edge_width = abs(r), color = r)) +
guides(edge_alpha = "none", edge_width = "none") +
scale_edge_colour_gradientn(limits = c(-1, 1), colors = c("#0f8b8d", "#a8201a")) +
geom_node_point(color = "black", size = 3) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_graph() +
labs(title = "Correlations between bioclimatic variables")
Methods: Several software to explore:
# Transpose the genotype matrix (SNPs x individuals -> individuals x
# SNPs) so that each row is one isolate, as lfmm expects.
datamash transpose \
< ${POPSTR}$VCFNAME.pass_noNA.GT.FORMAT2 \
> ${GEADIR}$VCFNAME.pass_noNA.GT.FORMAT2.transposed
#Create standardized values for the environment variables
#(z-scores of annual mean temperature and annual precipitation)
climate_per_coord_standard = climate_per_coordinates %>%
mutate(Temp = scale(`Annual Mean Temperature`),
Rain = scale(`Annual Precipitation`)) %>%
dplyr::select(X, Y, Temp, Rain)
#Get genotypes
#One row per isolate, one column per SNP (transposed upstream with
#datamash); the first column holds the isolate names and is dropped.
genotypes = read_tsv(paste0(GEA_dir, vcf_name, ".pass_noNA.GT.FORMAT2.transposed"),
col_names = F) %>%
dplyr::select(-X1) #removes ind_names
#Isolate order matching the genotype rows
ind_list = read_tsv(paste0(PopStr_dir, vcf_name, ".pass_noNA.ind"), col_names = "ID_file")
genotypes_temp = bind_cols(genotypes, ind_list)
#Get environmental data
temp_Zt_meta = left_join(Zt_meta, climate_per_coord_standard, by = c("Latitude" = "Y", "Longitude" = "X"))
#Join on ID_file so the environmental rows line up with the genotype
#rows. (Bug fix: a stray `tibble(name = target)` was being passed as
#left_join()'s positional `by` argument, clashing with by = "ID_file".)
temp_Zt_meta = left_join(ind_list, temp_Zt_meta %>%
dplyr::select(ID_file, Temp, Rain),
by = "ID_file") %>%
dplyr::select(-ID_file) #removes ind_names
#Latent factor mixed model: K = 6 latent factors (matching the number
#of genetic clusters found in the population structure analyses)
mod.lfmm <- lfmm_lasso(Y = genotypes, X = temp_Zt_meta, K=6,
nozero.prop = 0.02)
#Association tests with genomic inflation factor calibration
pv <- lfmm::lfmm_test(Y = genotypes,
X = temp_Zt_meta,
lfmm = mod.lfmm,
calibrate = "gif")
#Matrix of calibrated p-values, one column per environmental variable
pvalues <- pv$calibrated.pvalue
I need to check whether the obtained p-values are similar to what we would expect. Unfortunately, not much in the two examples I have chosen!
#QQ-plots of the calibrated p-values against the exponential
#distribution expected under the null (on the -log10 scale).
#Fix: each panel now plots its own column of `pvalues` ("Rain",
#"Temp") instead of the whole matrix, which the previous hard-coded
#version plotted twice.
par(mfrow=c(1, 2))
qqplot(rexp(length(pvalues[,"Rain"]), rate = log(10)),
-log10(pvalues[,"Rain"]), xlab = "Expected quantile",
pch = 19, cex = .4, col = "#82C0CC")
abline(0,1)
qqplot(rexp(length(pvalues[,"Temp"]), rate = log(10)),
-log10(pvalues[,"Temp"]), xlab = "Expected quantile",
pch = 19, cex = .4, col = "#FFA62B")
abline(0,1)
par(mfrow=c(1, 1))
Here are the resulting Manhattan plots. Seeing the two qqplots above, none of this should be taken too seriously!
#Genomic positions matching the SNP columns of the genotype matrix
positions = read_tsv(paste0(PopStr_dir, vcf_name, ".pass_noNA.GT.FORMAT.pos"), col_names = T)
## Manhattan plot
#One panel per bioclimatic variable, SNPs indexed by file order
as_tibble(as.data.frame(pvalues)) %>%
mutate(SNP = c(1:nrow(pvalues))) %>%
gather(key = "Bioclimatic variable", value = "pvalue", -SNP) %>%
ggplot(aes(x=SNP, y = -log10(pvalue), color = `Bioclimatic variable`)) +
geom_point(alpha = 0.5) +
facet_wrap(~`Bioclimatic variable`, scales = "free") +
theme_bw() + scale_color_manual(values = c("#82C0CC", "#FFA62B"))
#Same plots with SNPs colored by chromosome (alternating two blues)
bind_cols(positions %>% dplyr::select(-POS), as_tibble(as.data.frame(pvalues))) %>%
mutate(SNP = c(1:nrow(pvalues))) %>%
gather(key = "Bioclimatic variable", value = "pvalue", -SNP, -CHROM) %>%
ggplot(aes(x=SNP, y = -log10(pvalue), color = factor(CHROM))) +
geom_point(alpha = 0.5) +
facet_grid(rows = vars(`Bioclimatic variable`), scales = "free") +
theme_bw() + scale_color_manual(values = rep(c("#489fb5", "#82c0cc"), 10))
# For each of the 13 core chromosomes, build two genotype sets for
# bayenv: "only" = SNPs on that chromosome (test set) and "excepted" =
# SNPs on all other chromosomes (used for the covariance matrix).
for CHR in {1..13}
do
vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
--keep $ZTLIST.txt \
--remove-filtered-all --extract-FORMAT-info GT \
--max-missing 1.0 --min-alleles 2 --max-alleles 2 \
--maf 0.05 \
--chr ${CHR} \
--out ${GEADIR}Chr_${CHR}_only
vcftools --vcf ${VCFDIR}$VCFNAME.recode.vcf \
--keep $ZTLIST.txt \
--remove-filtered-all --extract-FORMAT-info GT \
--max-missing 1.0 --min-alleles 2 --max-alleles 2 \
--maf 0.05 \
--not-chr ${CHR} \
--out ${GEADIR}Chr_${CHR}_excepted
# Drop the CHROM/POS columns, keeping only the genotype columns
for i in only excepted ;
do
cat ${GEADIR}Chr_${CHR}_${i}.GT.FORMAT | cut -f 3- > ${GEADIR}Chr_${CHR}_${i}.GT.FORMAT2
done
done
from collections import defaultdict
#For each isolate, store its pop (as in sampling site) in a dictionary
dict_pop = dict(zip(r.Zt_meta["ID_file"], r.Zt_meta["Coordinates"]))
#Keep a list of the pop names/coordinates to write in the same order later
all_pops = list(set(r.Zt_meta["Coordinates"]))
bayenv_pop_name = r.GEA_dir + "SNPSFILE_pop"
with open(bayenv_pop_name, "w") as temp:
    temp.write("\n".join(all_pops))
#For each chromosome subset ("only" = that chromosome, "excepted" =
#all others), build a bayenv SNPSFILE: two lines per SNP holding the
#per-population counts of allele 0 and allele 1, in the fixed
#all_pops order written to SNPSFILE_pop above.
#(Renamed `chr`/`type` loop variables, which shadowed builtins, and
#removed an unused `total` accumulator.)
for chrom in [str(c) for c in range(1, 14)]:
    print(chrom)
    for subset in ["only", "excepted"]:
        #Set output file and write coordinates as a header
        bayenv_snps_name = r.GEA_dir + "Chr_" + chrom + "_" + subset + ".SNPSFILE"
        out = open(bayenv_snps_name, "w")
        #Read the genotype file and count alleles per pop
        with open(r.GEA_dir + "Chr_" + chrom + "_" + subset + ".GT.FORMAT2", "r") as input_snps:
            for i, snp in enumerate(input_snps):
                #Per-population counts of the two alleles, default 0
                dict_snp0 = defaultdict(int)
                dict_snp1 = defaultdict(int)
                lets_write = True
                #The first line is the name of the isolates
                if i == 0:
                    indv = snp.strip().split("\t")
                    lets_write = False
                else:
                    #Keeping isolate name and allelic value together,
                    #and counting the 0s and 1s per population
                    for ind, allele in zip(indv, snp.strip().split("\t")):
                        if allele == "0":
                            dict_snp0[dict_pop[ind]] += 1
                        elif allele == "1":
                            dict_snp1[dict_pop[ind]] += 1
                        else:
                            print("Only biallelic please!!!!")
                            lets_write = False
                #If nothing weird was found, write the two count lines
                if lets_write:
                    out.write("\t".join([str(dict_snp0[pop]) for pop in all_pops]) + "\n")
                    out.write("\t".join([str(dict_snp1[pop]) for pop in all_pops]) + "\n")
        out.close()
#TODO paste0(vcf_name, ".pass_noNA.GT.FORMAT2.SNPSFILE_pop") -> py$bayenv_pop_name
#Reorder the standardized climate table to match the population order
#written to SNPSFILE_pop by the Python chunk above. The pop names are
#"lat;lon" strings, so the join key is rebuilt with unite().
climate_per_coord_ordered = left_join(read_tsv(paste0(GEA_dir, "SNPSFILE_pop") , col_names = F),
climate_per_coord_standard %>%
unite(X1, Y, X, remove = F, sep = ";"))
#Write the environment values (rounded, unlabeled) for bayenv;
#one row per population here, transposed to one row per variable later.
climate_per_coord_ordered %>%
dplyr::select(-X1, -X, -Y) %>%
mutate(Temp = round(Temp, 3), Rain = round(Rain, 3)) %>%
write_delim(paste0(GEA_dir, "ENVIRONFILE_totranspose"), delim = "\t", col_names = F)
#Export the number of populations so the shell chunks can use it
nb_pop = nrow(read_tsv(paste0(GEA_dir, "SNPSFILE_pop"), col_names = F))
Sys.setenv(NUMPOPS=nb_pop)
# NUMPOPS is exported from R via Sys.setenv() just above
echo $NUMPOPS
# Estimate a genome-wide covariance matrix from the first 10000 count
# lines (= 5000 SNPs), keeping the last matrix of the MCMC chain.
head -n 10000 ${GEADIR}SNPSFILE > temp ;
cp /Users/feurtey/Documents/Software/bayenv2_mac/bayenv2 ${GEADIR}
cd ${GEADIR}
${GEADIR}bayenv2 -i temp -p $NUMPOPS -k 50001 > temp.mat
tail -n $(($NUMPOPS + 1)) temp.mat > temp.MATRIXFILE
# One leave-one-chromosome-out covariance matrix per chromosome:
# the matrix for Chr N is estimated from all chromosomes except N.
for CHR in {1..13}
do
head -n 10000 ${GEADIR}Chr_${CHR}_excepted.SNPSFILE > temp ;
${GEADIR}bayenv2 -i temp -p $NUMPOPS -k 50001 > temp.mat ;
tail -n $(($NUMPOPS + 1)) temp.mat > ${GEADIR}Chr_${CHR}_excepted.MATRIXFILE ;
done
for CHR in {1..13}
do
#To do: CHECK IF THE PER SNP LOOP DOES WHAT I THINK IT DOES
#Get number of SNPs
lnNB=$(cat ${GEADIR}Chr_${CHR}_only.SNPSFILE | wc -l)
SNPNB=$(($lnNB/2))
echo $CHR, $SNPNB
#Run bayenv per snp
for i in $(seq 1 ${SNPNB}) ;
do
CHOSENLINES=$(($i*2))
head -n $CHOSENLINES ${GEADIR}Chr_${CHR}_only.SNPSFILE | tail -n 2 > temp.SNPFILE
./bayenv2 \
-i temp.SNPFILE \
-e ENVIRONFILE \
-m Chr_${CHR}_excepted.MATRIXFILE \
-p ${NUMPOPS} -n ${NUMENV} \
-k 10000 -t \
-o Chr_${CHR}.Bayenv2_out
done
done
# Quick look at the bayenv2 output for one chromosome: plot the X2
# column (presumably the first Bayes-factor column -- confirm against
# the bayenv2 manual) for each SNP, in file order.
CHR = 7
temp = read_tsv(paste0(GEA_dir, "Chr_", CHR, ".Bayenv2_out.bf"), col_names = F) %>%
mutate(ID = row_number())
ggplot(temp, aes(ID, X2)) + geom_point()
This is based on the same scripts made for Fanny Hartmann’s paper on the pairs of populations: looking for alleles already known to be involved in resistance to fungicide.
#Known fungicide-resistance positions: one row per isolate x mutation.
#The first field encodes "Gene.AAchange"; the reference amino acid is
#the first character of AA_change, so an isolate carrying it is
#classified as "Origin", anything else as "Mutant".
genotypes_resistance = read_tsv(paste0(fung_dir, "genotypes_tidy_format.tab"),
col_names = c("temp", "ID_file", "Allele")) %>%
separate(temp, into = c("Gene", "AA_change"), sep = "\\.") %>%
dplyr::mutate(AA_REF = str_sub(AA_change, 1, 1)) %>%
dplyr::mutate(Allele_type = ifelse(AA_REF == Allele, "Origin", "Mutant")) %>%
left_join(Zt_meta)
#Counts of origin vs mutant alleles per mutation, faceted by gene
genotypes_resistance %>%
dplyr::count(AA_change, Gene, Allele_type) %>%
ggplot(aes(x=n, y=AA_change, fill = Allele_type)) +
geom_barh(stat="identity") +
facet_grid(Gene ~ ., scales = "free_y", space = "free_y")
I suspect that there will be an effect of both geography and time on the frequency of these alleles and so I try to represent this here.
#CYP51
#Per mutation: proportion of mutant alleles by continental region and
#sampling year; mutations never observed as mutant (Somme == 0) are
#dropped before plotting.
temp = genotypes_resistance %>%
filter(Gene == "CYP51") %>%
dplyr::count(AA_change, Continent_Region,Date_Year, Allele_type) %>%
pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
mutate(prop_mutant = Mutant/(Mutant + Origin)) %>%
filter(!is.na(Date_Year)) %>%
filter(!is.na(Continent_Region)) %>%
group_by(AA_change) %>%
dplyr::mutate(Somme = sum(Mutant)) %>%
filter(Somme > 0)
#Bubble plot: point size and fill both encode the mutant proportion
temp %>%
ggplot(aes(x=Date_Year, y=Continent_Region, size = prop_mutant, fill = prop_mutant)) +
geom_point(shape=21, col = myColors[3], alpha =0.7) + theme_bw() +
facet_wrap(.~AA_change) + scale_fill_gradient(low="white", high = "#16697a") +
labs(title = str_wrap(paste("Mutant proportion per known mutation",
"in the gene CYP51"), width = 35),
subtitle = str_wrap(paste("This gene can mutate and cause resistance to azole fungicides."), width = 65),
fill = "Proportion of mutants",
size = "Proportion of mutants",
x = "Years", y = "")
#Beta_tubulin
#Same bubble plot as for CYP51: mutant-allele proportion per known
#beta-tubulin mutation, by continental region and sampling year;
#mutations never observed as mutant are dropped.
#(Fixed "tubuline" -> "tubulin" in the plot title.)
genotypes_resistance %>%
filter(Gene == "beta_tubulin") %>%
dplyr::count(AA_change, Continent_Region,Date_Year, Allele_type) %>%
pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
mutate(Number_all = Mutant + Origin) %>%
mutate(prop_mutant = Mutant/Number_all) %>%
group_by(AA_change) %>%
dplyr::mutate(Somme = sum(Mutant)) %>%
filter(Somme > 0) %>%
ggplot(aes(x=Date_Year, y=Continent_Region, size = prop_mutant, fill = prop_mutant)) +
geom_point(shape=21, col = myColors[3], alpha =0.7) + theme_bw() +
facet_wrap(.~AA_change) + scale_fill_gradient(low="white", high = "#16697a")+
labs(title = str_wrap(paste("Mutant proportion per known mutation",
"in the beta tubulin gene"), width = 35),
subtitle = str_wrap(paste("This gene can mutate and cause resistance",
" to benzimidazole fungicides."), width = 65),
fill = "Proportion of mutants",
size = "Proportion of mutants",
x = "Years", y = "")
# Mitochondrial_cytb
#Mutant-allele proportion per known cytb mutation (QoI resistance),
#by continental region and sampling year; same layout as for CYP51.
genotypes_resistance %>%
filter(Gene == "mitochondrial_cytb") %>%
dplyr::count(AA_change, Continent_Region,Date_Year, Allele_type) %>%
pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
mutate(Number_all = Mutant + Origin) %>%
mutate(prop_mutant = Mutant/Number_all) %>%
group_by(AA_change) %>%
dplyr::mutate(Somme = sum(Mutant)) %>%
filter(Somme > 0) %>%
ggplot(aes(x=Date_Year, y=Continent_Region, size = prop_mutant, fill = prop_mutant)) +
geom_point(shape=21, col = myColors[3], alpha =0.7) + theme_bw() +
facet_wrap(.~AA_change) + scale_fill_gradient(low="white", high = "#16697a") +
labs(title = str_wrap(paste("Mutant proportion per known mutation",
"in the mitochondrial gene cytb"), width = 35),
subtitle = str_wrap(paste("This gene can mutate and cause resistance ",
"to QoI, or Quinone outside inhibitors."), width = 65),
fill = "Proportion of mutants",
size = "Proportion of mutants",
x = "Years", y = "")
# SDH genes
#Several SDH genes are pooled; gene and mutation are combined into a
#single Label so each facet is unambiguous.
genotypes_resistance %>%
filter(grepl("SDH", Gene)) %>%
unite(Label, Gene, AA_change, remove = T) %>%
dplyr::count(Label, Continent_Region, Date_Year, Allele_type) %>%
pivot_wider(names_from = Allele_type, values_from = n, values_fill = list(n = 0)) %>%
mutate(Number_all = Mutant + Origin) %>%
mutate(prop_mutant = Mutant/Number_all) %>%
group_by(Label) %>%
dplyr::mutate(Somme = sum(Mutant)) %>%
filter(Somme > 0) %>%
ggplot(aes(x=Date_Year, y=Continent_Region, size = prop_mutant, fill = prop_mutant)) +
geom_point(shape=21, col = myColors[3], alpha =0.7) + theme_bw() +
facet_wrap(.~Label) + scale_fill_gradient(low="white", high = "#16697a")+
labs(title = str_wrap(paste("Mutant proportion per known mutation",
"in one of the SDH genes"), width = 35),
subtitle = str_wrap(paste("This gene can mutate and cause resistance to SDHI fungicides."), width = 65),
fill = "Proportion of mutants",
size = "Proportion of mutants",
x = "Years", y = "")
I use the alleles identified in the GWAS of Thierry Marcel and track them in the world-wide populations.
#Alleles identified in the French GWAS; a position can have up to two
#virulence-associated alleles (comma-separated in the input).
virulence_table = read_tsv(paste0(virulence_dir, "French_GWAS_virulence_alleles.tab")) %>%
separate(VIRULENCE_ALLELES, into = c("VIRULENCE_ALLELE1", "VIRULENCE_ALLELE2"), sep = ",")
#VCF slice at the GWAS positions, reshaped to one row per isolate x
#position; the numeric genotype (0/1/2) is mapped back to the actual
#allele (REF/ALT1/ALT2) before classifying it as virulent or not.
french_GWAS_variants = read_tsv(paste0(virulence_dir, "Positions_GWAS.short.vcf"),
na = ".") %>%
pivot_longer(-c(`#CHROM`, POS, REF, ALT), names_to = "ID_file", values_to = "Allele_nb") %>%
separate(ALT, into = c("ALT1", "ALT2"), sep = ",") %>%
mutate(Allele_ID = ifelse(Allele_nb == 0, REF, ifelse(Allele_nb == 1, ALT1, ALT2))) %>%
left_join(., virulence_table) %>%
#Alleles matching neither list are left as NA
mutate(virulence = ifelse(Allele_ID == NON_VIRULENT_ALLELES, "Non_virulent",
ifelse(Allele_ID == VIRULENCE_ALLELE1 | Allele_ID == VIRULENCE_ALLELE2,
"Virulent", NA)))
french_GWAS_variants = inner_join(french_GWAS_variants, Zt_meta)
#Proportion of virulent alleles per position, region and year
french_GWAS_variants %>%
tidyr::unite(position, `#CHROM`, POS) %>%
dplyr::count(position, Continent_Region,Date_Year, virulence) %>%
pivot_wider(names_from = virulence, values_from = n, values_fill = list(n = 0)) %>%
mutate(Number_all = Virulent + Non_virulent) %>%
mutate(prop_virulent = Virulent/Number_all) %>%
ggplot(aes(x=Date_Year, y=Continent_Region, size = prop_virulent, fill = prop_virulent)) +
geom_point(shape=21, alpha =0.7) + theme_bw() +
scale_fill_gradient(low="white", high = "#16697a")+
labs(title = str_wrap(paste("Virulence alleles",
""), width = 35),
subtitle = str_wrap(paste("These alleles were identified in a GWAS study",
" to increase either virulence or aggressiveness",
" on some wheat varieties."), width = 65),
fill = "Proportion of virulent",
size = "Proportion of virulent",
x = "Years", y = "") +
facet_wrap(vars(position))
Well… Whatever…
#On the cluster: blast the Avr_Stb6 reference sequence against each sample's reads
while read sample ; do sh Detect_gene_blast.sh ./Directories_new.sh /data2/alice/WW_project/7_Virulence/Avr_Stb6.fasta ${sample} Illumina /data2/alice/WW_project/7_Virulence/${sample}_Avr_Stb6.blast.tab; done < list_Third_batch_Dec_2018.txt
#On the cluster: concatenate the per-sample blast tables into one file
cat /data2/alice/WW_project/7_Virulence/*Avr_Stb6.blast.tab > /data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.txt
#On the cluster: extract the hit sequence of each blast result from the
#de novo (SPAdes) assembly of the matching sample and collect them in one
#multi-fasta. Fields 11/12 are the subject start/end of the hit; they are
#sorted because blast reports them in hit orientation (start > end on the
#minus strand) while bedtools needs start <= end. The fasta header is
#prefixed with the sample name so tips can be traced back to isolates.
while read p ;
do
sample=$(echo $p | cut -f 1 -d " " ) ;
chr=$(echo $p | cut -f 4 -d " ") ;
start=$(echo $p | cut -f 11 -d " ") ; end=$(echo $p | cut -f 12 -d " ") ;
echo $sample, $chr, $start, $end ;
order_start=$(echo -e "$start\n$end" | sort -n | head -n1);
order_end=$(echo -e "$start\n$end" | sort -nr | head -n1);
echo -e "${chr}\t${order_start}\t${order_end}" > temp.bed ;
~/Software/bedtools getfasta \
-fi /data2/alice/WW_project/4_TE_RIP/1_Blast_from_denovo_assemblies/0_Spades/${sample}.fasta\
-bed temp.bed | \
sed "s/>/>${sample}:/" >> \
/data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.fasta ;
done < /data2/alice/WW_project/7_Virulence/Overall_results_blast_Avr_Stb6.txt
#Alignment done on the mafft web server, because laziness
#(--adjustdirection reverse-complements sequences as needed and prefixes
#their names accordingly)
mafft --thread 8 --threadtb 5 --threadit 0 --reorder --adjustdirection --auto input > output
Here is the tree for the Avr_Stb6 gene. For reference, the allele carried by ST99CH_1A4 gives an avirulent phenotype, while the allele from ST99CH_1A5 is the virulent allele (from Zhong et al. 2017).
#The .list file was produced on the cluster from the multi-fasta headers:
#grep ">" ../7_Virulence/Overall_results_blast_Avr_Stb6.fasta | sed 's/>//' > ../7_Virulence/Overall_results_blast_Avr_Stb6.list
#Split each header (sample:contig:coords) into its parts and build
#`contig2`, a sanitised copy ( ':' and '.' -> '_' ) matching the way the
#tree tip labels were mangled; then attach the isolate metadata.
temp2 <- read_tsv(paste0(virulence_dir, "Overall_results_blast_Avr_Stb6.list"),
                  col_names = "Fasta_header") %>%
  separate(Fasta_header, into = c("ID_file", "chr", "coords"),
           sep = ":", remove = F) %>%
  mutate(contig2 = str_replace_all(Fasta_header, "[:.]", "_")) %>%
  left_join(Zt_meta)
#Read tree in and rebuild readable tip labels from the mangled fasta headers
#details from the mafft website about the tree
#Size = 640 sequences × 319 sites
#Method = Neighbor-Joining
#Model = Jukes-Cantor
#Bootstrap resampling = 0
#Alignment id = .200930173318269H5fPcXaMbnDOkoF7E0Y3ilsfnormal
tree_file = "Tree_Avr_Stb6.nwk"
#separate() with the default separator splits the label on the first
#non-alphanumeric run; extra = "merge" keeps the rest in `contig`.
tree = as_tibble(treeio::read.tree(paste0(virulence_dir, tree_file))) %>%
mutate(label_copy = label) %>%
separate(label_copy, into = c("nb", "contig"), extra = "merge", remove =F) %>%
dplyr::mutate(nb = as.integer(nb)) %>%
dplyr::mutate(label_OG = label) %>%
#Strip the leading "R_" — presumably the prefix mafft --adjustdirection adds
#to reverse-complemented sequences (TODO confirm) — so the contig names
#match temp2$contig2 for the join below.
dplyr::mutate(contig2 = stringr::str_replace(contig, "R_", "")) %>%
left_join(., temp2, by = c("contig2")) %>%
#Prefer the sample ID as label; fall back to the contig name, then to the
#raw label, for tips with no metadata match (internal nodes, outgroup).
dplyr::mutate(label_temp = ifelse(is.na(ID_file), ifelse(is.na(contig), label, contig), ID_file)) %>%
unite(col = "label_new", nb, label_temp, sep = "_")
#Second, untouched copy of the tree (original labels) used for rooting.
tree2 <- as_tibble(treeio::read.tree(paste0(virulence_dir, tree_file)))

#Per-tip annotation table: region, year, collection, plus a flag marking the
#two reference isolates of Zhong et al. 2017 (virulent/avirulent alleles).
#NOTE(review): the text above cites ST99CH_1A4 as the avirulent reference,
#but the code flags ST99CH_1E4 — confirm which isolate is intended.
temp <- tree %>%
  mutate(original_vir = ifelse(ID_file %in% c("ST99CH_1A5", "ST99CH_1E4"),
                               ID_file, NA)) %>%
  dplyr::select(label, node, label_new, Continent_Region, Date_Year,
                Collection, original_vir) %>%
  filter(!is.na(label))
#Find the outgroup!
#Zp13 — presumably a Zymoseptoria sister-species isolate used as outgroup
#(TODO confirm) — locate its node and root the tree on it.
root_node = tree2 %>%
filter(grepl("Zp13", label)) %>%
dplyr::select(node) %>%
pull()
rooted.tree <- ape::root(as.treedata(tree2), root_node)
#%<+% attaches the annotation table to the ggtree object (matched on tip
#labels, hence dropping the `label` column itself) so aes() below can map
#tip colours to metadata.
p <- ggtree(rooted.tree, layout = "rectangular") %<+% (temp %>% dplyr::select (-label))
#Tips coloured by geographic region (Color_Continent is a shared scale
#defined elsewhere in this document).
p2 <- p + geom_tippoint(aes(color = Continent_Region), alpha = 0.7) +
theme(legend.position = "right") +
Color_Continent
#p2
#Tips coloured by the two reference isolates only (all other tips NA).
p3 <- p + geom_tippoint(aes(color = original_vir)) +
theme(legend.position = "right") +
scale_color_manual(values = c(mycolorsCorrel[1], mycolorsCorrel[20]))
cowplot::plot_grid(p2, p3)
#Tips coloured by sampling year, shown next to the reference-isolate panel.
p4 <- p + geom_tippoint(aes(color = Date_Year), alpha = 0.7) +
theme(legend.position = "right")
cowplot::plot_grid(p4, p3)